Line data Source code
1 : /* Generate prototypes, inlines and/or implementations for concurrent
2 : persistent shared maps based on linear probing with some insights
3 : from cuckoo hashing for improved concurrent performance. Notably:
4 :
5 : - Supports an arbitrary number of concurrent operations with a
6 : comparable performance to a single threaded HPC linear probed map
7 : for non-conflicting operations (likely hardware NOC limited for
8 : conflicting operations).
9 :
10 : - Concurrent queries do not interfere with other concurrent queries.
11 :
12 : - All map operations can be serialized.
13 :
14 : - Does not require a key sentinel (but can support them if useful
15 : for the application).
16 :
17 : - Does not require a guarantee there's at least one free element in
18 : the element store (but like a serial linear probed map, it is a
19 : good idea to keep utilization below the absolute theoretical
20 : capacity for strong algorithmic performance guarantees).
21 :
22 : - Insert/modify/query have a run-time configurable worst case O(1)
23 : cost, regardless of map fill ratio. (Remove's worst case cost is
24 : not configurable due to cases involving maps filled to or near
25 : capacity. For reasonable fill ratios, remove is also comparable.)
26 :
27 : - Stable in the sense that all keys with the same hash (or more
28 : generally same initial probe element) are ordered in the element
29 : store by insertion order. (This allows the user to use the key
30 : hash to group related keys contiguously in the element store and
31 : then stably iterate over them with fast streaming.)
32 :
33 : - Query requires no atomic operations at all. (Usual target memory
34 : subsystem requirements that writes to memory become visible to
35 : other threads in the order in which they were issued in the machine
36 : code.)
37 :
38 : - Insert/modify/remove only require atomic fetch-and-or (typically
39 : just one). There's no need for an atomic compare-and-swap /
40 : underlying pool of free elements / etc. (Can also be used as a
41 : non-concurrent map on targets without FD_HAS_ATOMIC.)
42 :
43 : - Map concurrency metadata and the actual map element store can be
44 : located in separate memory regions (can also split the element
45 : store over multiple memory regions ... e.g. keys here / values
46 : there) and can back any of these memory regions by a file system to
47 : scale beyond billions of elements with no code change.
48 :
49 : - Map metadata easily fits in CPU caches with a fixed O(1) overhead
50 : regardless of element store capacity. Access patterns naturally
51 : exploit CPU and storage caching, streaming and prefetching
52 : behaviors.
53 :
54 : - Supports asynchronous execution (e.g. issue hints for keys that
55 : will be accessed soon, do unrelated work while hints are
56 : prefetching need info into local cache in the background, then do
57 : key operations ... now all fast and local cache hits).
58 :
59 : - Supports non-plain-old-data keys and non-plain-old-data values
60 : (both of which are gross on real world computers but commonly done
61 : nevertheless).
62 :
63 : A map can be persisted beyond the lifetime of the creating process,
64 : be used inter-process, relocated in memory, be naively
65 : serialized/deserialized, be moved between hosts, etc. Massive
66 : concurrency, high algorithmic and implementation performance for
67 : normal usage and friendly cache / file system streaming access
68 : patterns for heavily loaded / heavily concurrent usage are
69 : prioritized. In particular, unlike fd_map_chain_para, this takes
70 : ownership of the underlying element store for the lifetime of the map
71 : in order to speed up operations and increase concurrency.
72 :
73 : Typical usage:
74 :
75 : struct myele {
76 : ulong key; // Technically "MAP_KEY_T MAP_KEY" (default is ulong key)
77 :
78 : ... key can be located arbitrarily in the element. The mapping
79 : ... of a key to an element in the element store is arbitrary and
80 : ... can move while the key is in the map.
81 : };
82 :
83 : typedef struct myele myele_t;
84 :
85 : #define MAP_NAME mymap
86 : #define MAP_ELE_T myele_t
87 : #include "tmpl/fd_map_slot_para.c"
88 :
89 : will declare the following APIs as a header only style library in the
90 : compilation unit:
91 :
92 : // A mymap_t is a stack declaration friendly quasi-opaque local
93 : // object used to hold the state of a local join to a mymap.
94 : // Similarly, a mymap_query_t / mymap_iter_t holds the local state
95 : // of an ongoing operation / iteration. E.g. it is fine to do
96 : // "mymap_t join[1];" to allocate a mymap_t but the contents should
97 : // not be used directly.
98 :
99 : typedef struct mymap_private mymap_t;
100 : typedef struct mymap_query_private mymap_query_t;
101 : typedef struct mymap_iter_private mymap_iter_t;
102 :
103 : // mymap_lock_max returns the maximum number of version locks that
104 : // can be used by a mymap. Will be a positive integer
105 : // power-of-two.
106 :
107 : ulong mymap_lock_max( void );
108 :
109 : // mymap_lock_cnt_est returns a reasonable number of locks to use
110 : // for a map backed by an ele_max capacity element store. Assumes
111 : // ele_max is an integer power-of-two. Returns an integer
112 : // power-of-two in [1,mymap_lock_max()].
113 :
114 : ulong mymap_lock_cnt_est( ulong ele_max );
115 :
116 : // mymap_probe_max_est returns a reasonable maximum probe sequence
117 : // length for a map backed by an ele_max capacity element store.
118 : // Assumes ele_max is an integer power-of-two. Returns an integer
119 : // in [1,ele_max].
120 :
121 : ulong mymap_probe_max_est( ulong ele_max );
122 :
123 : // mymap_{align,footprint} returns the alignment and footprint
124 : // needed for a memory region to be used as a mymap. align will be
125 : // an integer power-of-two and footprint will be a multiple of
126 : // align. ele_max / lock_cnt / probe_max specify the capacity of
127 : // the element store / number of version locks / maximum probe
128 : // sequence length for the map. footprint returns 0 for invalid
129 : // configurations. In a valid configuration, ele_max is an integer
130 : // power-of-two, lock_cnt is an integer power-of-two, lock_cnt is
131 : // at most min( lock_max, ele_max ) and probe_max is in
132 : // [1,ele_max].
133 : //
134 : // mymap_new formats a memory region with the required alignment
135 : // and footprint into a mymap. shmem points in the caller's
136 : // address space to the memory region to use. Returns shmem on
137 : // success (mymap has ownership of the memory region) and NULL on
138 : // failure (no changes, logs details). The caller is not joined on
139 : // return. All map versions will be at version 0 / unlocked. The
140 : // map contents will be in whatever state the backing element store
141 : // is in. IMPORTANT SAFETY TIP! THE ELEMENT STORE SHOULD BE IN A
142 : // CONSISTENT STATE BEFORE USING MYMAP_NEW. For example, the
143 : // caller could mark all elements as free before calling this and
144 : // the caller could use verify immediately after creation to verify
145 : // integrity.
146 : //
147 : // mymap_join joins the caller to an existing mymap. ljoin points
148 : // to a mymap_t compatible memory region in the caller's address
149 : // space, shmap points in the caller's address space to the memory
150 : // region containing the mymap, and shele points in the caller's
151 : // address space to mymap's element store. Returns a handle to the
152 : // caller's local join on success (join has ownership of the ljoin
153 : // region) and NULL on failure (no changes, logs details).
154 : //
155 : // mymap_leave leaves a mymap join. join points to a current local
156 : // join. Returns the memory region used for the join on success
157 : // (caller has ownership on return and the caller is no longer
158 : // joined) and NULL on failure (no changes, logs details). Use the
159 : // join accessors before leaving to get shmap and shele used by the
160 : // join if needed.
161 : //
162 : // mymap_delete unformats a memory region used as a mymap. Assumes
163 : // shmap points in the caller's address space to a memory region
164 : // containing the mymap and that there are no joins. Returns shmap
165 : // on success (caller has ownership of the memory region, any
166 : // remaining elements still in the mymap are released to the caller
167 : // implicitly) and NULL on failure (no changes, logs details).
168 :
169 : ulong mymap_align ( void );
170 : ulong mymap_footprint( ulong ele_max, ulong lock_cnt, ulong probe_max );
171 : void * mymap_new ( void * shmem, ulong ele_max, ulong lock_cnt, ulong probe_max, ulong seed );
172 : mymap_t * mymap_join ( void * ljoin, void * shmap, void * shele );
173 : void * mymap_leave ( mymap_t * join );
174 : void * mymap_delete ( void * shmap );
175 :
176 : // mymap_{ele_max,lock_cnt,probe_max,seed} return the mymap
177 : // configuration. Assumes join is a current local join. The
178 : // values will be valid for the mymap lifetime.
179 :
180 : ulong mymap_ele_max ( mymap_t const * join );
181 : ulong mymap_lock_cnt ( mymap_t const * join );
182 : ulong mymap_probe_max( mymap_t const * join );
183 : ulong mymap_seed ( mymap_t const * join );
184 :
185 : // mymap_{shmap,shele} return join details. Assumes join is a
186 : // current local join. The values will be valid for the join
187 : // lifetime. mymap_{shmap_const,shele_const} are const correct
188 : // versions.
189 :
190 : void const * mymap_shmap_const( mymap_t const * join );
191 : void const * mymap_shele_const( mymap_t const * join );
192 :
193 : void * mymap_shmap( mymap_t * join );
194 : void * mymap_shele( mymap_t * join );
195 :
196 : // mymap_lock_{idx,ele0,ele1} specify the mapping between map
197 : // version lock indices and element store element indices. Assumes
198 : // join is a current local join and ele_idx / lock_idx is in
199 : // [0,ele_max) / is in [0,lock_cnt). mymap_lock_idx is the index
200 : // of the version lock that protects element store element ele_idx,
201 : // in [0,lock_cnt). [mymap_lock_ele0,mymap_lock_ele1) is the
202 : // contiguous range of elements protected by lock lock_idx. ele0
203 : // is in [0,ele_max), ele1 is in (0,ele_max], and ele0<ele1.
204 :
205 : ulong mymap_lock_idx ( mymap_t const * join, ulong ele_idx );
206 : ulong mymap_lock_ele0( mymap_t const * join, ulong lock_idx );
207 : ulong mymap_lock_ele1( mymap_t const * join, ulong lock_idx );
208 :
209 : // mymap_key_{eq,hash} expose the provided MAP_KEY_{EQ,HASH} macros
210 : // as inlines with strict semantics. They assume that the provided
211 : // pointers are in the caller's address space to keys that will not
212 : // be changed during the call. They retain no interest in any keys
213 : // on return.
214 : //
215 : // mymap_key_eq returns 1 if *k0 and *k1 are equal and 0 otherwise.
216 : //
217 : // mymap_key_hash returns the hash of *key using the hash function
218 : // seed. Should ideally be a random mapping from a MAP_KEY_T to a
219 : // ulong but this depends on what the user actually used for
220 : // MAP_KEY_HASH. The seed used by a particular mymap instance can
221 : // be obtained above.
222 :
223 : int mymap_key_eq ( ulong * k0, ulong * k1 );
224 : ulong mymap_key_hash( ulong * key, ulong seed );
225 :
226 : // mymap_backoff does FD_SPIN_PAUSE a random number of times. The
227 : // number of pauses is an approximately uniform IID random number
228 : // in [0,scale/2^16] where scale is in [0,2^32). Specifically, the
229 : // number of pauses is:
230 : //
231 : // floor( scale r / 2^48 )
232 : //
233 : // where r is a non-deterministic 32-bit uniform IID random number.
234 : // Under the hood, r is generated by hashing the user provided seed
235 : // and the least significant 32-bits of the CPU tickcounter.
236 : // Ideally, seed is a 32-bit globally unique identifier for the
237 : // logical thread of execution but this is up to the application to
238 : // specify and rarely matters in practice. This is a useful
239 : // building block for random exponential backoffs.
240 :
241 : void mymap_backoff( ulong scale, ulong seed );
242 :
243 : // mymap_query_memo returns the key_hash of the query associated
244 : // with the query's key to allow users to minimize potentially
245 : // expensive key hash computations in various operations.
246 : //
247 : // mymap_query_ele returns a pointer in the caller's address space
248 : // to the element store element associated with the query or a
249 : // sentinel value. The sentinel value is application dependent and
250 : // thus arbitrary (e.g. not necessarily in the element store,
251 : // including NULL, a local temporary used as a bit bucket, etc).
252 : // Assumes query is valid. The lifetime of the returned pointer
253 : // depends on the query. mymap_query_ele_const is a const correct
254 : // version.
255 :
256 : ulong mymap_query_memo ( mymap_query_t const * query );
257 : myele_t const * mymap_query_ele_const( mymap_query_t const * query );
258 : myele_t * mymap_query_ele ( mymap_query_t * query );
259 :
260 : // mymap_hint hints that the caller plans to do an operation
261 : // involving key soon. Assumes join is a current local join, key
262 : // points to a valid key in the caller's address space for the
263 : // duration of the call and query points to a local scratch to hold
264 : // info about the hint. Retains no interest in key. On return,
265 : // the query memo will be initialized.
266 : //
267 : // flags is a bit-or of FD_MAP_FLAG flags. If FD_MAP_FLAG_USE_HINT
268 : // is set, this will assume that query's memo is already
269 : // initialized for key (e.g. mostly useful for hashless
270 : // prefetching). If FD_MAP_FLAG_PREFETCH_META /
271 : // FD_MAP_FLAG_PREFETCH_DATA is set, this will issue a prefetch for
272 : // key's mymap metadata (i.e. lock version) / the element at the
273 : // start of key's probe sequence (i.e. the location of key or
274 : // contiguously shortly before it) FD_MAP_FLAG_PREFETCH combines
275 : // both for convenience. This can be used to overlap key access
276 : // latency with unrelated operations. All other flags are ignored.
277 :
278 : void
279 : mymap_hint( mymap_t const * join,
280 : ulong const * key,
281 : mymap_query_t * query,
282 : int flags );
283 :
284 : // mymap_prepare tries to start an insert/modify/blocking query
285 : // operation for key. Assumes join is a current local join, key
286 : // points to valid key in the caller's address space for the
287 : // duration of the call and query points to a local scratch to hold
288 : // the info about the prepare. Retains no interest in key.
289 : // Returns FD_MAP_SUCCESS (0) on success and an FD_MAP_ERR
290 : // (negative) on failure. This is a non-blocking fast O(1)
291 : // (O(probe_max) worst case) and supports highly concurrent operation.
292 : //
293 : // flags is a bit-or of FD_MAP_FLAG flags. If FD_MAP_FLAG_BLOCKING
294 : // is set / clear in flags, this is allowed / not allowed to block
295 : // the caller. If FD_MAP_FLAG_USE_HINT is set, this assumes
296 : // query's memo is already initialized for key. This can be used
297 : // to avoid redundant expensive key hashing when prefetching. All
298 : // other flags are ignored (the upper 26-bits of flags can be used
299 : // to provide a local seed for random backoffs but this is up to
300 : // the application and rarely matters in practice).
301 : //
302 : // On success, the caller is in a prepare for key, query is
303 : // initialized with info about prepare (including query's memo
304 : // initialized for key). ele=mymap_query_ele(query) gives the
305 : // location in the caller's address space to an element store
306 : // element for the prepare that will be stable for the duration of
307 : // the prepare and memo=mymap_query_memo(query) gives the key hash.
308 : //
309 : // If the element is marked as free, key is not in the map and ele
310 : // is where key could be inserted. If the caller is inserting key,
311 : // the caller should populate element's key with key, element's
312 : // memo (if any) with memo (avoids having to rehash the key), mark
313 : // ele as used and then do a mymap_publish to complete the insert.
314 : // If not, the caller should keep ele marked as free and do a
315 : // mymap_cancel to complete the prepare (doesn't matter from the
316 : // map's point-of-view if anything else was clobbered /
317 : // mymap_publish would also work here).
318 : //
319 : // If the element is marked as used, key is in the map at ele. If
320 : // the caller is modifying key's value, the caller should do the
321 : // modification and then mymap_publish to complete the modify. If
322 : // not (e.g. blocking query), the caller can inspect ele contents
323 : // and the mymap_cancel to complete the blocking query
324 : // (mymap_publish would also work here). In both cases, the caller
325 : // should not modify ele's key, modify ele's memo, or mark ele as
326 : // free. Note that mymap_publish must be used even if the
327 : // modifications were only temporary.
328 : //
329 : // On failure, the caller is not in a prepare for key, query
330 : // ele==sentinel and query memo will be initialized for key.
331 : // Reasons for failure:
332 : //
333 : // - FD_MAP_ERR_AGAIN: A potentially conflicting operation was in
334 : // progress at some point during the call. Try again later (e.g.
335 : // after a random exponential backoff). Never returned on a
336 : // blocking call.
337 : //
338 : // - FD_MAP_ERR_FULL: key was not in the map but inserting ele
339 : // would require making a probe sequence longer than probe_max.
340 : // Try again when the map is less full (e.g. after removing some
341 : // elements).
342 : //
343 : // mymap_publish ends the prepare described by query, updating the
344 : // map version to reflect changes made during the prepare. Assumes
345 : // query is valid and describes an active prepare. Cannot fail and
346 : // the caller will no longer be in the prepare on return (query's
347 : // memo will still be initialized for key). This is a generally safe way
348 : // to end a prepare even if the caller did not modify the map
349 : // during the prepare.
350 : //
351 : // mymap_cancel ends the prepare described by query, reverting the
352 : // map version to reflect that the caller did not change the map
353 : // during the prepare. Assumes query is valid and describes an
354 : // active prepare and that the caller did not make any meaningful
355 : // modifications to the map during the prepare (note that temporary
356 : // changes during the prepare can be considered modifications as
357 : // per the above). Cannot fail and the caller will no longer be in
358 : // the prepare on return (query's memo will still be initialized
359 : // for key). This is a safe way to end a prepare only if the
360 : // caller did not modify the map during the prepare.
361 : //
362 : // IMPORTANT SAFETY TIP! Do not nest or interleave prepares,
363 : // remove or queries for the same map on the same thread.
364 : //
365 : // IMPORTANT SAFETY TIP! A successful prepare must have a matching
366 : // publish or cancel (and then ideally as soon as possible).
367 : //
368 : // IMPORTANT SAFETY TIP! The order in which keys that hash to the
369 : // same slot were inserted into the map is preserved for the
370 : // lifetime of the keys. Thus the hash function used can be
371 : // constructed to create ordered iterators over groups of keys.
372 :
373 : int mymap_prepare( mymap_t * join, ulong const * key, myele_t * sentinel, mymap_query_t * query, int flags );
374 : void mymap_publish( mymap_query_t * query );
375 : void mymap_cancel ( mymap_query_t * query );
376 :
377 : // mymap_remove removes key from the mymap. Assumes join is a
378 : // current local join and key is valid for the duration of the
379 : // call. Retains no interest in key. This is non-blocking,
380 : // typically fast O(1), and supports highly concurrent operation.
381 : //
382 : // flags is a bit-or of FD_MAP_FLAG flags. If FD_MAP_FLAG_BLOCKING
383 : // is set / clear in flags, this is allowed / not allowed to block
384 : // the caller. If FD_MAP_FLAG_USE_HINT is set, this assumes
385 : // query's memo is already initialized for key. This can be used
386 : // to avoid redundant expensive key hashing when prefetching. If
387 : // clear, query is ignored and can be set NULL. All other flags
388 : // are ignored (the upper 26-bits of flags can be used to provide a
389 : // local seed for random backoffs but this is up to the application
390 : // and rarely matters in practice).
391 : //
392 : // Returns FD_MAP_SUCCESS (0) on success and an FD_MAP_ERR
393 : // (negative) on failure. On success, key's mapping was removed at
394 : // some point during the call. On failure, no changes were made by
395 : // this call and:
396 : //
397 : // - FD_MAP_ERR_KEY: Key was not found in the mymap at some point
398 : // during the call.
399 : //
400 : // - FD_MAP_ERR_AGAIN: A potentially conflicting operation was in
401 : // progress at some point during the call. Same considerations
402 : // as prepare above. Never returned on a blocking call.
403 : //
404 : // IMPORTANT SAFETY TIP! Do not nest or interleave prepares,
405 : // remove or queries for the same map on the same thread.
406 :
407 : int mymap_remove( mymap_t * join, ulong const * key, mymap_query_t const * query, int flags );
408 :
409 : // mymap_query_try tries to speculatively query a mymap for key.
410 : // On return, query will hold information about the try (including
411 : // query's memo initialized for key). sentinel gives the query
412 : // element pointer value (arbitrary) to pass through when it is not
413 : // safe to try the query. Assumes join is a current local join and
414 : // key is valid for the duration of the call. Does not modify the
415 : // mymap and retains no interest in key, sentinel or query. This
416 : // is a non-blocking fast O(1) (O(probe_max) worst case) and
417 : // supports highly concurrent operation.
418 : //
419 : // flags is a bit-or of FD_MAP_FLAG flags. If FD_MAP_FLAG_BLOCKING
420 : // is set / clear in flags, this is allowed / not allowed to block
421 : // the caller. If FD_MAP_FLAG_USE_HINT is set, this assumes
422 : // query's memo is already initialized for key. This can be used
423 : // to avoid redundant expensive key hashing when prefetching. All
424 : // other flags are ignored (the upper 26-bits of flags can be used
425 : // to provide a local seed for random backoffs but this is up to
426 : // the application and rarely matters in practice).
427 : //
428 : // Returns FD_MAP_SUCCESS (0) on success and an FD_MAP_ERR
429 : // (negative) on failure. On success, key mapped to the element
430 : // store element mymap_query_ele( query ) in the caller's address
431 : // space at some point during the call. The mymap retains
432 : // ownership of this element but the caller can zero copy
433 : // speculatively process the element's contents. On failure,
434 : // mymap_query_ele( query ) will be sentinel and returns:
435 : //
436 : // - FD_MAP_ERR_KEY: Key was not found in the mymap at some point
437 : // during the call.
438 : //
439 : // - FD_MAP_ERR_AGAIN: A potentially conflicting operation was in
440 : // progress at some point during the call. Try again later (e.g.
441 : // after a random exponential backoff). Unlike prepare and
442 : // remove, this call does _not_ require any locks for key's probe
443 : // sequence. As such, AGAIN can only be caused by concurrent
444 : // prepare/remove operations and this will never interfere with
445 : // any other concurrent operation. Among the many implications,
446 : // a query will never delay a concurrent query and AGAIN will
447 : // never be returned if only concurrent speculative queries are
448 : // in progress. Never returned on a blocking call.
449 : //
450 : // IMPORTANT SAFETY TIP! THE CALLER SHOULD BE PREPARED TO HANDLE
451 : // ARBITRARY AND/OR INCONSISTENT VALUES FOR ELEMENT FIELDS DURING
452 : // SPECULATIVE PROCESSING. CALLERS SHOULD NOT COMMIT ANY RESULTS
453 : // OF SPECULATIVE PROCESSING UNTIL IT TESTS THE QUERY WAS
454 : // SUCCESSFUL.
455 : //
456 : // The simplest form of speculative processing is to copy the
457 : // element store element into a local temporary, test that the
458 : // speculation was valid, and then process the local temporary copy
459 : // at its leisure. Zero copy, more selective copying and/or
460 : // writing speculative results into local temporaries are more
461 : // advanced examples of speculative processing.
462 : //
463 : // Use mymap_prepare to do a blocking (non-speculative) query.
464 : //
465 : // mymap_query_test tests if an in-progress query is still valid.
466 : // Assumes query is valid, we are still in a query try and lock
467 : // version numbers have not wrapped since we started the try.
468 : // Returns FD_MAP_SUCCESS (0) if the query is still valid and
469 : // FD_MAP_ERR_AGAIN (negative) if a potentially conflicting
470 : // operation was in progress at some point during the try.
471 : //
472 : // IMPORTANT SAFETY TIP! Do not nest or interleave prepares,
473 : // remove or queries for the same map on the same thread.
474 :
475 : int
476 : mymap_query_try( mymap_t const * join,
477 : ulong const * key,
478 : myele_t const * sentinel,
479 : mymap_query_t * query,
480 : int flags );
481 :
482 : int mymap_query_test( mymap_query_t const * query );
483 :
484 : // mymap_lock_range tries to acquire locks lock_idx for lock_idx in
485 : // [range_start,range_start+range_cnt) (cyclic).
486 : //
487 : // flags is a bit-or of FD_MAP_FLAG flags. If FD_MAP_FLAG_BLOCKING
488 : // is set / clear in flags, this is allowed / not allowed to block
489 : // the caller. If FD_MAP_FLAG_RDONLY is set, the caller promises
490 : // to only read the elements covered by the range while holding the
491 : // locks. All other flags are ignored (the upper 26-bits of flags
492 : // can be used to provide a local seed for random backoffs but this
493 : // is up to the application and rarely matters in practice).
494 : //
495 : // Returns FD_MAP_SUCCESS (0) on success and FD_MAP_ERR_AGAIN
496 : // (negative) if there was a potentially conflicting operation in
497 : // progress at some point during the call. On success,
498 : // version[lock_idx] will hold the version to use when releasing
499 : // that lock. On failure, version may have been clobbered. AGAIN
500 : // is never returned if BLOCKING is set.
501 : //
502 : // mymap_unlock_range unlocks a similarly specified range. Assumes
503 : // caller has the locks and version[lock_idx] is the value set when
504 : // locked was obtained.
505 : //
506 : // These both assume join is a current local join, range_start is
507 : // in [0,lock_cnt), range_cnt is in [0,lock_cnt] and version is
508 : // valid with space for lock_cnt entries (YES ... LOCK_CNT, NOT
509 : // RANGE_CNT ... this is trivial with a compile time stack
510 : // temporary as lock_cnt<=MAP_LOCK_MAX).
511 :
512 : int
513 : mymap_lock_range( mymap_t * join,
514 : ulong range_start,
515 : ulong range_cnt,
516 : int flags,
517 : ulong * version );
518 :
519 : void
520 : mymap_unlock_range( mymap_t * join,
521 : ulong range_start,
522 : ulong range_cnt,
523 : ulong const * version );
524 :
525 : // The mymap_iter_* APIs are used to iterate over all keys inserted
526 : // into the map with the same memo (to support grouping of keys by
527 : // key hash value). The iteration order will be from the least
528 : // recently inserted to most recently inserted. flags has similar
529 : // meaning as other APIs. Example usage:
530 : //
531 : // ulong memo = ... hash of keys to iterate over ...
532 : //
533 : // mymap_iter_t iter[1];
534 : // int err = mymap_iter_init( join, memo, 0, iter );
535 : //
536 : // if( FD_UNLIKELY( err ) ) {
537 : //
538 : // ... At this point, err is FD_MAP_ERR_AGAIN and caller has
539 : // ... ownership of iter. There was a potentially conflicting
540 : // ... prepare or remove in progress at some point during the
541 : // ... call. We can try again later (e.g. after a random
542 : // ... backoff or doing other non-conflicting work).
543 : // ... mymap_iter_done will be 1, mymap_iter_fini will be a
544 : // ... no-op. Never returned if mymap_iter_init flags has
545 : // ... FD_MAP_FLAG_BLOCKING set.
546 : //
547 : // } else {
548 : //
549 : // ... At this point, we are in an iteration and iteration has
550 : // ... ownership of iter.
551 : //
552 : // while( !mymap_iter_done( iter ) ) {
553 : // myele_t * ele = mymap_iter_ele( iter );
554 : //
555 : // ... At this point, mymap_key_hash( ele->key, seed ) == memo (==ele's memo if memoized)
556 : //
557 : // ... process ele here.
558 : //
559 : // ... IMPORTANT! Generally speaking, it is not okay to
560 : // ... insert, remove, modify, blocking read, non-blocking
561 : // ... read here. It is okay to read ele and modify any
562 : // ... value fields though. If mymap_iter_init flags had
563 : // ... FD_MAP_FLAG_RDONLY set, caller promises it is only
564 : // ... reading ele here.
565 : //
566 : // mymap_iter_next( iter );
567 : // }
568 : //
569 : // mymap_iter_fini( iter );
570 : //
571 : // ... At this point, we are not in an iteration and caller has
572 : // ... ownership of iter.
573 : //
574 : // }
575 :
576 : int mymap_iter_init( mymap_t * join, ulong memo, int flags, mymap_iter_t * lmem );
577 : int mymap_iter_done( mymap_iter_t * iter );
578 : myele_t * mymap_iter_ele ( mymap_iter_t * iter );
579 : mymap_iter_t * mymap_iter_next( mymap_iter_t * iter );
580 : mymap_iter_t * mymap_iter_fini( mymap_iter_t * iter );
581 :
582 : // mymap_verify returns FD_MAP_SUCCESS (0) if the join, underlying
583 : // map and underlying element store give a mapping of unique keys
584 : // to unique elements in the element store with a bounded maximum
585 : // probe length. Returns FD_MAP_ERR_INVAL (negative) otherwise (no
586 : // changes by this call, logs details). Assumes that caller has
587 : // all the map locks and/or the map is otherwise known to be idle.
588 :
589 : int mymap_verify( mymap_t const * join );
590 :
591 : // mymap_strerror converts an FD_MAP_SUCCESS / FD_MAP_ERR code into
592 : // a human readable cstr. The lifetime of the returned pointer is
593 : // infinite. The returned pointer is always to a non-NULL cstr.
594 :
595 : char const * mymap_strerror( int err );
596 :
597 : Do this as often as desired in a compilation unit to get different
598 : types of concurrent maps. Options exist for generating library
599 : header prototypes and/or library implementations for concurrent maps
600 : usable across multiple compilation units. Additional options exist
601 : to use different hashing functions, key comparison functions, etc as
602 : detailed below.
603 :
604 : IMPORTANT SAFETY TIP! If using a key sentinel, prepare/remove/query
605 : operations assume the input key is not the key sentinel (i.e. a
606 : sentinel is not considered a "valid" key). Sentinel keys are not
607 : necessary if MAP_ELE_IS_FREE, MAP_ELE_FREE and MAP_ELE_MOVE are set
608 : appropriately.
609 :
610 : To better understand prepare / publish / cancel semantics:
611 :
612 : mykey_t * key = ... key to insert / modify / blocking query
613 :
614 : mymap_query_t query[1];
615 : int err = mymap_prepare( map, key, sentinel, query, 0 );
616 : myele_t * ele = mymap_query_ele ( query );
617 : ulong memo = mymap_query_memo( query );
618 :
619 : ... At this point, memo == mymap_key_hash( key, seed )
620 :
621 : if( FD_UNLIKELY( err ) ) {
622 :
623 : ... At this point, we are not in a prepare for key and
624 : ... ele==sentinel.
625 :
626 : ... If err is FD_MAP_ERR_AGAIN, there was a potentially
627 : ... conflicting prepare or remove in progress at some point
628 : ... during the call. We can try again later (e.g. after a
629 : ... random backoff or doing other non-conflicting work).
630 : ... Never returned for a blocking call.
631 :
632 : ... If err is FD_MAP_ERR_FULL, key was not in the map but
633 : ... inserting it would have created a key probe sequence longer
634 : ... than probe_max at some point during the call. We can try
635 : ... again later when it is less full (e.g. after removing keys
636 : ... from the map).
637 :
     } else if( ... ele is marked as free ... ) {
639 :
640 : ... At this point, we are in a prepare for key, key is not in
641 : ... the map and ele points in the caller's address space to free
642 : ... element in the element store suitable for holding key.
643 :
644 : ... initialize ele here, including populating ele's key with key
645 : ... and (if memoized) populating ele's memo with memo.
646 :
647 : if( ... we decided not to insert key ... ) mymap_cancel( query ); // "failed insert"
648 : else {
649 : ... mark ele as used
650 : mymap_publish( query ); // "insert"
651 : }
652 :
653 : } else {
654 :
655 : ... At this point, we are in a prepare for key, key is in the map
656 : ... and ele points in the caller's address space to the element
657 : ... store element that currently contains key. We are free to
658 : ... modify ele's value. We should not modify ele's key, modify
659 : ... ele's memo (if memoized) or mark ele as free.
660 :
661 : ... process ele here
662 :
663 : if( ... we didn't modify ele ... ) mymap_cancel ( query ); // "blocking query"
664 : else mymap_publish( query ); // "modify"
665 :
666 : }
667 :
668 : To better understand remove semantics:
669 :
670 : mykey_t * key = ... key to remove
671 :
672 : int err = mymap_remove( map, key, NULL, 0 );
673 :
674 : if( FD_UNLIKELY( err ) ) {
675 :
676 : ... If err is FD_MAP_ERR_AGAIN, there was a potentially
677 : ... conflicting prepare or remove in progress at some point
678 : ... during the call. We can try again later (e.g. after a random
679 : ... backoff or doing other non-conflicting work).
680 : ... Never returned for a blocking call.
681 :
682 : ... If err is FD_MAP_ERR_KEY, key was not in the map at some
683 : ... point during the call (so remove did not do anything).
684 :
685 : } else {
686 :
687 : ... key was removed from the map at some point during the call.
688 : ... The remove might have shuffled other keys. This shuffling
689 : ... can only decrease probe sequence length for any remaining
690 : ... keys and preserves insertion ordering for keys with the same
691 : ... hash (or initial probe element more generally).
692 :
693 : }
694 :
695 : To better understand query semantics:
696 :
697 : mykey_t * key = ... key to query
698 :
699 : mymap_query_t query[1];
700 : int err = mymap_query_try( join, key, sentinel, query, 0 );
701 : myele_t const * ele = mymap_query_ele_const( query );
702 : ulong memo = mymap_query_memo ( query );
703 :
704 : ... At this point, memo==mymap_key_hash( key, seed )
705 :
706 : if( FD_UNLIKELY( err ) ) {
707 :
708 : ... At this point, ele==sentinel
709 : ...
710 : ... If err is FD_MAP_ERR_KEY, key was not in the mymap at some
711 : ... point during the try.
712 : ...
713 : ... If err is FD_MAP_ERR_AGAIN, there was a potentially
714 : ... conflicting operation in progress during the try and we can
715 : ... try again later (e.g. after a random backoff or doing other
716 : ... non-conflicting work).
717 :
718 : } else {
719 :
720 : ... At this point, ele points in our address space to an element
721 : ... in the element store (non-NULL) and ele->key matched key at
722 : ... some point during the try.
723 :
724 : ... Speculatively process ele here.
725 : ...
726 : ... DO NOT TRUST ANY RESULTS OF THIS SPECULATIVE PROCESSING YET.
727 : ... THERE IS NO GUARANTEE YET THAT A CONCURRENT USER HASN'T
728 : ... CHANGED THE MYMAP IN A WAY THAT COULD YIELD ARBITRARY AND
729 : ... INCONSISTENT RESULTS.
730 : ...
731 : ... The simplest and most common form of speculative processing
732 : ... is to copy the needed portions of ele into local stack temps.
733 : ...
734 : ... Note: concurrent operations include removing key from the
735 : ... mymap (and maybe multiple cycles of inserting and removing it
736 : ... and then at potentially different element store locations) or
737 : ... unrelated removes potentially shuffling the location of this
738 : ... key. That's not an issue practically as the ele pointer here
739 : ... will be to an element compatible memory region that will
740 : ... continue to exist regardless and we shouldn't be trusting any
741 : ... query reads yet (the query test will detect if these can be
742 : ... trusted). See rant in fd_map_chain_para.c for more details.
743 :
744 : ... At this point, we are done with speculative processing (or we
745 : ... don't want to do any more speculative processing if the try
746 : ... has already failed).
747 :
748 : err = mymap_query_test( query );
749 : if( FD_UNLIKELY( err ) ) {
750 :
751 : ... At this point, err will be FD_MAP_ERR_AGAIN and a
752 : ... potentially conflicting operation in the try was detected
753 : ... by the test.
754 :
755 : ... Application specific handling here (e.g. try again after a
756 : ... random backoff or doing other non-conflicting work).
757 :
758 : } else {
759 :
760 : ... At this point, the results of the speculation thus far can
761 : ... be trusted and can be considered to have been computed at
762 : ... some point in time between try and test.
763 :
764 : }
765 : }
766 :
767 : Example use of lock_range / unlock (do a parallel snapshot of an
768 : entire map at a globally well defined point in time with minimal
769 : interference to ongoing concurrent modifications):
770 :
771 : ulong version[ mymap_lock_max() ];
772 :
773 : ulong lock_cnt = mymap_lock_cnt( join );
774 :
775 : mymap_lock_range( join, 0, lock_cnt, FD_MAP_FLAGS_BLOCKING | FD_MAP_FLAGS_RDONLY, version );
776 :
777 : for( ulong lock_idx=0UL; lock_idx<lock_cnt; lock_idx++ ) { ... parallelize this loop over snapshot threads as desired
778 : ulong ele0 = mymap_lock_ele0( join, lock_idx );
779 : ulong ele1 = mymap_lock_ele1( join, lock_idx );
780 :
781 : ... process element store elements [ele0,ele1) here
782 :
783 : mymap_unlock_range( join, lock_idx, 1UL, version );
784 : }
785 :
786 : Note that mymap_lock_range in this example might block the caller for
787 : a long time if the map is under heavy concurrent modification. To
788 : prioritize the snapshotting over these operations, the same API can
789 : be used to prioritize the snapshot over ongoing concurrent
790 : modifications:
791 :
792 : ulong version[ mymap_lock_max() ];
793 :
794 : ulong lock_cnt = mymap_lock_cnt( join );
795 :
796 : for( ulong lock_idx=0UL; lock_idx<lock_cnt; lock_idx++ )
797 : mymap_lock_range( join, lock_idx, 1UL, FD_MAP_FLAGS_BLOCKING | FD_MAP_FLAGS_RDONLY, version );
798 :
799 : for( ulong lock_idx=0UL; lock_idx<lock_cnt; lock_idx++ ) { ... parallelize this loop over snapshot threads as desired
800 : ulong ele0 = mymap_lock_ele0( join, lock_idx );
801 : ulong ele1 = mymap_lock_ele1( join, lock_idx );
802 :
803 : ... process element store elements [ele0,ele1) here
804 :
805 : mymap_unlock_range( join, lock_idx, 1UL, version );
806 : }
807 :
808 : Implementation overview:
809 :
   A map is basically a persistent shared array of version numbers
   named lock.  lock[ lock_idx ] contains a version number that covers
   map slots [lock_idx*(ele_max/lock_cnt),(lock_idx+1)*(ele_max/lock_cnt)).
813 :
814 : When trying an operation that could impact probe sequences passing
815 : through a lock's range of slots, the version number is atomically
816 : incremented. It is incremented again at completion. It may also
817 : be decremented if the operation didn't end up modifying any of the
818 : covered slots.
819 :
820 : Thus, an {odd,even} version number indicates that there is {a
821 : potential,not any} operation in progress that could impact probe
822 : sequences passing through the corresponding slots. The most
823 : significant bits of the version number can be used for lockfree
824 : style operations to detect changes to any probe sequences they use.
825 :
826 : When the map is not overloaded, key probe sequences are typically
827 : O(1) long and, in general, at most a (user configured) probe_max
828 : long. Since a version number covers many slots typically, this
829 : implies that the typical "read" operation (e.g.
830 : query_try/query_test) looks like:
831 :
832 : - try: observe lock version numbers covering all slots in key's
833 : probe sequence, fail if any locked (typically 1 normal read
834 : that hits L1/L2 cache, especially in the common case of
835 : reads more frequent than writes)
836 : - spec: speculatively process the element containing key
837 : - test: check version numbers haven't changed (typically 1 normal
838 : read that is an even more likely L1/L2 cache hit), fail if
839 : any changed
840 :
841 : And the typical "write" operation (e.g. prepare/publish) looks
842 : like:
843 :
844 : - prepare: increment lock version numbers covering all slots in
845 : key's probe sequence, fail if any locked (typically 1
846 : atomic fetch-and-or done test-and-test-and-set style to
847 : minimize hardware NOC contention)
848 : - exec: (non-speculatively) process the element containing key
849 : - publish: increment version numbers (typically 1 normal read/write
850 : that hits L1/L2 cache)
851 :
852 : Readers never block concurrent readers or writers. Writers can
853 : block other readers and other writers. If there are many more
854 : version locks than concurrent writers though, writers are unlikely
855 : to interfere with concurrent readers or writers. In all cases, all
856 : map operations are serializable.
857 :
858 : For maps that are loaded to their capacity, probe sequences could
859 : be up to probe_max long and probe_max might be quite large. This
860 : implies that more than one version lock might be needed. Since
861 : this range is cyclic contiguous in memory, the locking operations
862 : are nice compact streaming access patterns. And similarly for the
863 : element store access patterns. */
864 :
865 : #include "fd_map.h"
866 :
867 : /* MAP_NAME gives the API prefix to use for map */
868 :
869 : #ifndef MAP_NAME
870 : #error "Define MAP_NAME"
871 : #endif
872 :
873 : /* MAP_ELE_T is the map element type */
874 :
875 : #ifndef MAP_ELE_T
876 : #error "Define MAP_ELE_T"
877 : #endif
878 :
879 : /* MAP_KEY_T is the map key type */
880 :
881 : #ifndef MAP_KEY_T
882 : #define MAP_KEY_T ulong
883 : #endif
884 :
885 : /* MAP_KEY is the MAP_ELE_T key field */
886 :
887 : #ifndef MAP_KEY
888 5516805 : #define MAP_KEY key
889 : #endif
890 :
891 : /* MAP_KEY_EQ returns 0/1 if *k0 is the same/different as *k1 */
892 :
893 : #ifndef MAP_KEY_EQ
894 0 : #define MAP_KEY_EQ(k0,k1) ((*(k0))==(*(k1)))
895 : #endif
896 :
897 : /* MAP_KEY_HASH returns a random mapping of *key into ulong. The
898 : mapping is parameterized by the 64-bit ulong seed. */
899 :
900 : #ifndef MAP_KEY_HASH
901 : #define MAP_KEY_HASH(key,seed) fd_ulong_hash( (*(key)) ^ (seed) )
902 : #endif
903 :
904 : /* If MAP_MEMOIZE is defined to non-zero, elements have a field that
905 : can be used while in the map to hold the MAP_KEY_HASH for an
906 : element's key. This is useful for accelerating user code that might
907 : need a hash and various map operations. */
908 :
909 : #ifndef MAP_MEMOIZE
910 : #define MAP_MEMOIZE 0
911 : #endif
912 :
913 : /* If MAP_MEMOIZE is non-zero, MAP_MEMO is the memo element field.
914 : Should be a ulong. Like MAP_KEY and MAP_NEXT, when an element is in
915 : the map, this value is managed by the map and will contain the
916 : MAP_KEY_HASH of the element's key and the map's seed. */
917 :
918 : #ifndef MAP_MEMO
919 : #define MAP_MEMO memo
920 : #endif
921 :
922 : /* If MAP_MEMOIZE is defined to non-zero, a non-zero MAP_KEY_EQ_IS_SLOW
923 : indicates the MAP_MEMO field should be used to accelerate MAP_KEY_EQ
924 : operations. This is useful when MAP_KEY_EQ is non-trivial (e.g.
925 : variable length string compare, large buffer compares, etc). */
926 :
927 : #ifndef MAP_KEY_EQ_IS_SLOW
928 : #define MAP_KEY_EQ_IS_SLOW 0
929 : #endif
930 :
931 : /* MAP_ELE_IS_FREE returns 0/1 if the slot pointed to by ele in the
932 : caller's address space contains / does not contain a key-value pair.
933 : The implementation can assume ele is valid and that it is safe to
934 : speculate on ele. The default implementation tests if key is not 0.
935 : If using a different key sentinel or not using a key sentinel, update
936 : this appropriately. */
937 :
938 : #ifndef MAP_ELE_IS_FREE
939 0 : #define MAP_ELE_IS_FREE(ctx,ele) (!((ele)->MAP_KEY))
940 : #endif
941 :
942 : /* MAP_ELE_FREE frees the key-value pair in the slot pointed to by ele
943 : in the caller's address space. The implementation can assume ele is
944 : valid, ele contains a key-value pair on entry and there will be no
945 : concurrent operations on ele during the free. The default
946 : implementation sets key to 0. If using a different key sentinel or
947 : not using a key sentinel, update this appropriately. Likewise, if
948 : not using plain-old-data keys and values, this should do the
949 : appropriate resource management. The join ctx is provided to
950 : facilitate this. */
951 :
952 : #ifndef MAP_ELE_FREE
953 0 : #define MAP_ELE_FREE(ctx,ele) do (ele)->MAP_KEY = (MAP_KEY_T)0; while(0)
954 : #endif
955 :
956 : /* MAP_ELE_MOVE moves the key-value pair in slot src to slot dst.
957 : src and dst are in the caller's address space. The implementation
958 : can assume src and dst are valid, dst does not contain a key-value
959 : pair on entry, src contains a key-value on pair on entry, and there
960 : will be no concurrent operations on src and dst during the move. The
961 : default implementation shallow copies src to dst and sets src key to
962 : 0. If using a different key sentinel or not using a key sentinel,
963 : update this appropriately. Likewise, if elements do not use
964 : plain-old-data keys and/or values, this should do the appropriate key
965 : and/or value resource management. The join ctx is provided to
966 : facilitate this. */
967 :
968 : #ifndef MAP_ELE_MOVE
969 0 : #define MAP_ELE_MOVE(ctx,dst,src) do { MAP_ELE_T * _src = (src); (*(dst)) = *_src; _src->MAP_KEY = (MAP_KEY_T)0; } while(0)
970 : #endif
971 :
972 : /* MAP_CTX_MAX specifies the maximum number of bytes of user context
973 : for use in MAP_ELE above (e.g. custom allocators / workspaces / local
974 : pointers to additional value arrays / etc). This context will be
975 : ulong aligned. Default is up to 72 bytes. */
976 :
977 : #ifndef MAP_CTX_MAX
978 3 : #define MAP_CTX_MAX (72UL)
979 : #endif
980 :
981 : /* MAP_VERSION_T gives the map version index type. Should be a
982 : primitive unsigned integer type. The least significant bit is used
983 : to indicate whether or not a slot could be impacted by an in progress
984 : map operation. The remaining bits indicate the version number. The
985 : default is ulong, yielding effectively infinite ABA protection (e.g.
986 : a lockfree query operation would need to be stalled for over ~2^63
987 : concurrent insert/modify/remove map operations before risk of getting
988 : confused by version number reuse ... which would take millennia for
989 : modern hardware practically). Narrow types yield less metadata
990 : footprint overhead for the map and lower ABA protection. (For human
991 : hard real-time applications, uint is probably fine and, in for
992 : computer hard computer real-time applications, ushort and/or uchar
993 : could be fine). */
994 :
995 : #ifndef MAP_VERSION_T
996 104570463 : #define MAP_VERSION_T ulong
997 : #endif
998 :
999 : /* MAP_LOCK_MAX gives the maximum number of version locks the map can
1000 : support. This should be positive and an integer power-of-two. This
1001 : essentially is limit on the maximum number of concurrent operations
1002 : and thus should be much greater than the number of concurrent
1003 : insert/modify/remove operations in expected map usage. Default is
1004 : 1024.
1005 :
1006 : Note that this is not theoretically required for the below
1007 : implementation. This exists to compile time bound stack utilization
1008 : of prepare/remove/query_try. That is,
1009 : sizeof(MAP_VERSION_T)*MAP_LOCK_MAX should be a L1D cache / L2D cache
1010 : / stack allocation friendly footprint (defaults yield 8 KiB).
1011 : MAP_LOCK_MAX could be removed by using an dynamic stack allocation
1012 : but that would limit this to targets with FD_HAS_ALLOCA. Could also
1013 : be eliminated by using a dynamic footprint lock cache in the query
1014 : structure, join structures and/or combining the query and join
1015 : structures. These are cumbersome for the user and the last two add
1016 : restrictions to intra-process multithreaded usage not seen in other
1017 : FD persistent inter-process datastructures. (Consider using a
1018 : massive/reasonable MAP_LOCK_MAX when FD_HAS_ALLOCA is set/clear and
1019 : then using alloca in prepare/remove/query_try when FD_HAS_ALLOCA is
1020 : set? Almost the best of both worlds but does imply some subtle
1021 : restrictions if trying to interoperate between targets compiled with
1022 : different features ... avoiding for now.) */
1023 :
1024 : #ifndef MAP_LOCK_MAX
1025 3000045 : #define MAP_LOCK_MAX (1024)
1026 : #endif
1027 :
1028 : /* MAP_ALIGN gives the alignment required for the map shared memory.
1029 : Default is 128 for double cache line alignment. Should be at least
1030 : ulong alignment. */
1031 :
1032 : #ifndef MAP_ALIGN
1033 : #define MAP_ALIGN (128UL)
1034 : #endif
1035 :
1036 : /* MAP_MAGIC gives the shared memory magic number to aid in persistent
1037 : and/or interprocess usage. */
1038 :
1039 : #ifndef MAP_MAGIC
1040 3 : #define MAP_MAGIC (0xf17eda2c37c5107UL) /* firedancer cslot version 0 */
1041 : #endif
1042 :
1043 : /* MAP_IMPL_STYLE controls what to generate:
1044 : 0 - header only library
1045 : 1 - library header declaration
1046 : 2 - library implementation */
1047 :
1048 : #ifndef MAP_IMPL_STYLE
1049 : #define MAP_IMPL_STYLE 0
1050 : #endif
1051 :
1052 : /* Implementation *****************************************************/
1053 :
1054 : #if MAP_IMPL_STYLE==0 /* local use only */
1055 : #define MAP_STATIC FD_FN_UNUSED static
1056 : #else /* library header and/or implementation */
1057 : #define MAP_STATIC
1058 : #endif
1059 :
1060 128790720 : #define MAP_(n) FD_EXPAND_THEN_CONCAT3(MAP_NAME,_,n)
1061 :
1062 : #if MAP_IMPL_STYLE!=2 /* need header */
1063 :
1064 : #include "../bits/fd_bits.h"
1065 :
1066 : struct __attribute__((aligned(MAP_ALIGN))) MAP_(shmem_private) {
1067 :
1068 : /* This point is MAP_ALIGN aligned */
1069 :
1070 : ulong magic; /* ==MAP_MAGIC */
1071 : ulong ele_max; /* Element store capacity, positive and an integer power-of-two */
1072 : ulong lock_cnt; /* Number of locks, positive and an integer power-of-two <= min( ele_max, MAP_LOCK_MAX ) */
1073 : ulong probe_max; /* Maximum length probe sequence, in [1,ele_max] */
1074 : ulong seed; /* Key hash seed, arbitrary */
1075 : int lock_shift; /* log2( ele_max / lock_cnt ), non-negative */
1076 :
1077 : /* Padding to MAP_ALIGN alignment here */
1078 :
1079 : /* MAP_VERSION_T lock[ lock_cnt ] here (obligatory sigh about lagging
1080 : C++ support for 0 sized structure array footers). */
1081 :
1082 : /* Padding to MAP_ALIGN alignment here */
1083 : };
1084 :
1085 : typedef struct MAP_(shmem_private) MAP_(shmem_t);
1086 :
1087 : struct MAP_(private) {
1088 : MAP_ELE_T * ele; /* Location of the element store in the local address space, indexed [0,ele_max) */
1089 : MAP_VERSION_T * lock; /* Location of the lock versions in the local address space, indexed [0,lock_cnt) */
1090 : ulong ele_max; /* ==shmem->ele_max */
1091 : ulong lock_cnt; /* ==shmem->lock_cnt */
1092 : ulong probe_max; /* ==shmem->probe_max */
1093 : ulong seed; /* ==shmem->seed */
1094 : int lock_shift; /* ==shmem->lock_shift */
1095 : int _pad; /* padding to ulong alignment */
1096 : uchar ctx[ MAP_CTX_MAX ]; /* User context for MAP_ELE_IS_FREE/MAP_ELE_FREE/MAP_ELE_MOVE */
1097 : };
1098 :
1099 : typedef struct MAP_(private) MAP_(t);
1100 :
1101 : struct MAP_(query_private) {
1102 : ulong memo; /* Query key memo */
1103 : MAP_ELE_T * ele; /* Query element in the local address space */
1104 : MAP_VERSION_T * l; /* Lock needed for this query in the local address space */
1105 : MAP_VERSION_T v; /* Version of lock at query start */
1106 : };
1107 :
1108 : typedef struct MAP_(query_private) MAP_(query_t);
1109 :
1110 : struct MAP_(iter_private) {
1111 : MAP_ELE_T * ele; /* Location of the element store in the local address space, indexed [0,ele_max) */
1112 : MAP_VERSION_T * lock; /* Location of the lock versions in the local address space, indexed [0,lock_cnt) */
1113 : ulong ele_max; /* ==shmem->ele_max */
1114 : ulong lock_cnt; /* ==shmem->lock_cnt */
1115 : ulong seed; /* ==shmem->seed */
1116 : ulong memo; /* matching memo for iteration */
1117 : ulong ele_idx; /* If ele_rem>0, current matching element, ignored otherwise */
1118 : ulong ele_rem; /* Number of elements remaining to probe, in [0,probe_max] */
1119 : ulong version_lock0; /* Index of first lock used by this iter, in [0,lock_cnt] */
1120 : ulong version_cnt; /* Number of locks used by this iter, in [0,lock_cnt] (typically 1) */
1121 : MAP_VERSION_T version[ MAP_LOCK_MAX ]; /* Direct mapped cache of version numbers for unlock */
1122 : };
1123 :
1124 : typedef struct MAP_(iter_private) MAP_(iter_t);
1125 :
1126 : FD_PROTOTYPES_BEGIN
1127 :
1128 : /* map_private_try returns the version of the lock observed at some
1129 : point during the call. Assumes lock is valid. If the least
1130 : significant bit of the returned value is set (i.e. is odd), an
1131 : operation was in progress on a key whose probe sequence includes a
1132 : map slot covered by this lock (i.e. it is not a good time to try the
1133 : operation). If the LSB is clear (i.e. is even), no operation was in
1134 : progress (i.e. it is a good time to try). This is a compiler memory
1135 : fence. */
1136 :
1137 : static inline MAP_VERSION_T
1138 8136078 : MAP_(private_try)( MAP_VERSION_T volatile const * l ) {
1139 8136078 : MAP_VERSION_T v;
1140 8136078 : FD_COMPILER_MFENCE();
1141 8136078 : v = *l;
1142 8136078 : FD_COMPILER_MFENCE();
1143 8136078 : return v;
1144 8136078 : }
1145 :
1146 : /* map_private_test tests a range of lock versions matched their locally
1147 : cached versions at some point during the call. Specifically, tests
1148 : lock[lock_idx]==version[lock_idx] for all lock_idx in
1149 : [version_lock0,version_lock0+version_cnt) (cyclic). lock_cnt is the
1150 : number of locks and assumed to be positive and an integer
1151 : power-of-two. Returns SUCCESS (zero) if all match (i.e. no probe
1152 : sequences through slots covered by the locks between when the last
1153 : lock in the range was observed and this was called changed) and AGAIN
1154 : (negative) otherwise. This is a compiler memory fence. */
1155 :
1156 : static inline int
1157 : MAP_(private_test)( MAP_VERSION_T volatile const * lock,
1158 : ulong lock_cnt,
1159 : MAP_VERSION_T const * version,
1160 : ulong lock_idx, /* version_lock0 */
1161 3743964 : ulong version_cnt ) {
1162 3743964 : FD_COMPILER_MFENCE();
1163 7933995 : for( ; version_cnt; version_cnt-- ) {
1164 4190031 : if( FD_UNLIKELY( lock[ lock_idx ]!=version[ lock_idx ] ) ) break; /* opt for low contention */
1165 4190031 : lock_idx = (lock_idx+1UL) & (lock_cnt-1UL);
1166 4190031 : }
1167 3743964 : FD_COMPILER_MFENCE();
1168 3743964 : return version_cnt ? FD_MAP_ERR_AGAIN : FD_MAP_SUCCESS; /* cmov */
1169 3743964 : }
1170 :
1171 : /* map_private_lock returns the version of the lock observed at some
1172 : point during the call. Assumes lock is valid. If the least
1173 : significant bit of the returned version is set (i.e. is odd), the
1174 : caller did not get the lock (i.e. an operation was already in
1175 : progress on a key whose probe sequence includes a map slot covered by
1176 : this lock). If the LSB is clear (i.e. is even), the caller got the
1177 : lock (i.e. is free to do an operation involving probe sequences that
1178 : pass through the range covered by the lock) and *lock LSB was set.
1179 : This is a compiler memory fence. When the target does not have
1180 : FD_HAS_ATOMIC, this operation is emulated. When emulated, the map
1181 : will not be safe to use concurrently but will still work with
1182 : comparable performance to a serial implementation. */
1183 :
1184 : static inline MAP_VERSION_T
1185 41844600 : MAP_(private_lock)( MAP_VERSION_T volatile * l ) {
1186 41844600 : MAP_VERSION_T v;
1187 41844600 : FD_COMPILER_MFENCE();
1188 : # if FD_HAS_ATOMIC /* test-and-test-and-set style */
1189 41844600 : v = *l;
1190 41844600 : if( FD_LIKELY( !((ulong)v & 1UL) ) ) v = FD_ATOMIC_FETCH_AND_OR( l, (MAP_VERSION_T)1 ); /* opt for low contention */
1191 : # else
1192 : v = *l;
1193 : *l = (MAP_VERSION_T)((ulong)v | 1UL);
1194 : # endif
1195 41844600 : FD_COMPILER_MFENCE();
1196 41844600 : return v;
1197 41844600 : }
1198 :
1199 : /* map_private_unlock unlocks lock[lock_idx] for lock_idx in
1200 : [version_lock0,version_lock0+version_cnt) (cyclic). Assumes
1201 : version[lock_idx] is the version the caller wants post unlock (which
1202 : implies that, on entry, version[lock_idx] = lock[lock_idx] + delta
1203 : where delta is odd and >=-1 (the -1 case corresponds "unlock with no
1204 : changes made to the covered elements"). This cannot fail. This is a
1205 : compiler memory fence. */
1206 :
1207 : static inline void
1208 : MAP_(private_unlock)( MAP_VERSION_T volatile * lock,
1209 : ulong lock_cnt,
1210 : MAP_VERSION_T const * version,
1211 : ulong lock_idx, /* version_lock0 */
1212 29991768 : ulong version_cnt ) {
1213 29991768 : FD_COMPILER_MFENCE();
1214 56830494 : for( ; version_cnt; version_cnt-- ) {
1215 26838726 : lock[ lock_idx ] = version[ lock_idx ];
1216 26838726 : lock_idx = (lock_idx+1UL) & (lock_cnt-1UL);
1217 26838726 : }
1218 29991768 : FD_COMPILER_MFENCE();
1219 29991768 : }
1220 :
1221 : /* map_private_ele_{is_free,free,move} expose the
1222 : MAP_ELE_{IS_FREE,FREE,MOVE} macros as inlines with strict semantics.
1223 :
1224 : map_private_ele_is_free returns 1 if ele does not contain a key-val
1225 : pair and 0 otherwise. ctx will be the join's user context, ele will
1226 : be a valid pointer to an element store element in the caller's
1227 : address space that is safe to speculate on. Retains no interest in
1228 : ele.
1229 :
1230 : map_private_ele_free frees any key and/or val resources used by ele
1231 : and marks ele as free. ctx will be the join's user context, ele will
1232 : be a valid pointer to an element store element in the caller's
1233 : address space that is marked as used. Retains no interest in ele.
1234 :
1235 : map_private_ele_move moves the key-val pair from element src to
1236 : element dst and marks src as free. ctx will be the join's user
1237 : context, src/dst will be a valid pointers to an element store element
1238 : in the caller's address space. dst/src will be marked as free/used
1239 : on entry and should be marked as used/free on return. Retains no
1240 : interest in dst or src. */
1241 :
1242 : FD_FN_PURE static inline int
1243 : MAP_(private_ele_is_free)( void const * ctx,
1244 52867698 : MAP_ELE_T const * ele ) {
1245 52867698 : (void)ctx;
1246 52867698 : return !!(MAP_ELE_IS_FREE( (ctx), (ele) ));
1247 52867698 : }
1248 :
1249 : static inline void
1250 : MAP_(private_ele_free)( void * ctx,
1251 3750408 : MAP_ELE_T * ele ) {
1252 3750408 : (void)ctx;
1253 3750408 : MAP_ELE_FREE( (ctx), (ele) );
1254 1875204 : }
1255 :
1256 : static inline void
1257 : MAP_(private_ele_move)( void * ctx,
1258 : MAP_ELE_T * dst,
1259 341448 : MAP_ELE_T * src ) {
1260 341448 : (void)ctx;
1261 341448 : MAP_ELE_MOVE( (ctx), (dst), (src) );
1262 170652 : }
1263 :
1264 9 : FD_FN_CONST static inline ulong MAP_(lock_max)( void ) { return MAP_LOCK_MAX; }
1265 :
1266 3000006 : FD_FN_CONST static inline ulong MAP_(lock_cnt_est) ( ulong ele_max ) { return fd_ulong_min( ele_max, MAP_LOCK_MAX ); }
1267 3000006 : FD_FN_CONST static inline ulong MAP_(probe_max_est)( ulong ele_max ) { return ele_max; }
1268 :
1269 192 : FD_FN_CONST static inline ulong MAP_(align)( void ) { return alignof(MAP_(shmem_t)); }
1270 :
1271 : FD_FN_CONST static inline ulong
1272 : MAP_(footprint)( ulong ele_max,
1273 : ulong lock_cnt,
1274 42 : ulong probe_max ) {
1275 42 : if( !( fd_ulong_is_pow2( ele_max ) &
1276 42 : fd_ulong_is_pow2( lock_cnt ) & (lock_cnt<=fd_ulong_min( ele_max, MAP_LOCK_MAX )) &
1277 42 : (1UL<=probe_max) & (probe_max<=ele_max) ) ) return 0UL;
1278 12 : return fd_ulong_align_up( sizeof(MAP_(shmem_t)) + lock_cnt*sizeof(MAP_VERSION_T), alignof(MAP_(shmem_t)) ); /* no overflow */
1279 42 : }
1280 :
1281 9 : FD_FN_PURE static inline ulong MAP_(ele_max) ( MAP_(t) const * join ) { return join->ele_max; }
1282 9 : FD_FN_PURE static inline ulong MAP_(lock_cnt) ( MAP_(t) const * join ) { return join->lock_cnt; }
1283 3 : FD_FN_PURE static inline ulong MAP_(probe_max)( MAP_(t) const * join ) { return join->probe_max; }
1284 9 : FD_FN_PURE static inline ulong MAP_(seed) ( MAP_(t) const * join ) { return join->seed; }
1285 :
1286 3 : FD_FN_PURE static inline void const * MAP_(shmap_const)( MAP_(t) const * join ) { return ((MAP_(shmem_t) const *)join->lock)-1; }
1287 3 : FD_FN_PURE static inline void const * MAP_(shele_const)( MAP_(t) const * join ) { return join->ele; }
1288 :
1289 3 : FD_FN_CONST static inline void * MAP_(ctx) ( MAP_(t) * join ) { return join->ctx; }
1290 3 : FD_FN_CONST static inline void const * MAP_(ctx_const)( MAP_(t) const * join ) { return join->ctx; }
1291 3 : FD_FN_CONST static inline ulong MAP_(ctx_max) ( MAP_(t) const * join ) { (void)join; return MAP_CTX_MAX; }
1292 :
1293 3 : FD_FN_PURE static inline void * MAP_(shmap)( MAP_(t) * join ) { return ((MAP_(shmem_t) *)join->lock)-1; }
1294 9 : FD_FN_PURE static inline void * MAP_(shele)( MAP_(t) * join ) { return join->ele; }
1295 :
1296 12288 : FD_FN_PURE static inline ulong MAP_(ele_lock) ( MAP_(t) const * join, ulong ele_idx ) { return ele_idx >> join->lock_shift; }
1297 7477266 : FD_FN_PURE static inline ulong MAP_(lock_ele0)( MAP_(t) const * join, ulong lock_idx ) { return lock_idx << join->lock_shift; }
1298 7477266 : FD_FN_PURE static inline ulong MAP_(lock_ele1)( MAP_(t) const * join, ulong lock_idx ) { return (lock_idx+1UL) << join->lock_shift; }
1299 :
1300 : FD_FN_PURE static inline int
1301 : MAP_(key_eq)( MAP_KEY_T const * k0,
1302 31186575 : MAP_KEY_T const * k1 ) {
1303 31186575 : return !!(MAP_KEY_EQ( (k0), (k1) ));
1304 31186575 : }
1305 :
1306 : FD_FN_PURE static inline ulong
1307 : MAP_(key_hash)( MAP_KEY_T const * key,
1308 78672240 : ulong seed ) {
1309 78672240 : return (MAP_KEY_HASH( (key), (seed) ));
1310 78672240 : }
1311 :
1312 : static inline void
1313 : MAP_(backoff)( ulong scale,
1314 0 : ulong seed ) {
1315 0 : ulong r = (ulong)(uint)fd_ulong_hash( seed ^ (((ulong)fd_tickcount())<<32) );
1316 0 : for( ulong rem=(scale*r)>>48; rem; rem-- ) FD_SPIN_PAUSE();
1317 0 : }
1318 :
1319 37486944 : FD_FN_PURE static inline ulong MAP_(query_memo )( MAP_(query_t) const * query ) { return query->memo; }
1320 7489380 : FD_FN_PURE static inline MAP_ELE_T const * MAP_(query_ele_const)( MAP_(query_t) const * query ) { return query->ele; }
1321 15005874 : FD_FN_PURE static inline MAP_ELE_T * MAP_(query_ele )( MAP_(query_t) * query ) { return query->ele; }
1322 :
1323 : static inline void
1324 7498842 : MAP_(publish)( MAP_(query_t) * query ) {
1325 7498842 : MAP_VERSION_T volatile * l = query->l;
1326 7498842 : MAP_VERSION_T v = (MAP_VERSION_T)((ulong)query->v + 2UL);
1327 7498842 : FD_COMPILER_MFENCE();
1328 7498842 : *l = v;
1329 7498842 : FD_COMPILER_MFENCE();
1330 7498842 : }
1331 :
1332 : static inline void
1333 7507032 : MAP_(cancel)( MAP_(query_t) * query ) {
1334 7507032 : MAP_VERSION_T volatile * l = query->l;
1335 7507032 : MAP_VERSION_T v = query->v;
1336 7507032 : FD_COMPILER_MFENCE();
1337 7507032 : *l = v;
1338 7507032 : FD_COMPILER_MFENCE();
1339 7507032 : }
1340 :
1341 : static inline int
1342 3745416 : MAP_(query_test)( MAP_(query_t) const * query ) {
1343 3745416 : MAP_VERSION_T volatile const * l = query->l;
1344 3745416 : ulong v = query->v;
1345 3745416 : FD_COMPILER_MFENCE();
1346 3745416 : ulong _v = *l;
1347 3745416 : FD_COMPILER_MFENCE();
1348 3745416 : return _v==v ? FD_MAP_SUCCESS : FD_MAP_ERR_AGAIN;
1349 3745416 : }
1350 :
/* unlock_range releases the range_cnt locks starting at lock
   range_start (cyclic), restoring the caller provided versions.  Thin
   wrapper around private_unlock using the join's lock array. */

static inline void
MAP_(unlock_range)( MAP_(t) *             join,
                    ulong                 range_start,
                    ulong                 range_cnt,
                    MAP_VERSION_T const * version ) {
  MAP_(private_unlock)( join->lock, join->lock_cnt, version, range_start, range_cnt );
}
1358 :
1359 7492350 : FD_FN_PURE static inline int MAP_(iter_done)( MAP_(iter_t) * iter ) { return !iter->ele_rem; }
1360 3743982 : FD_FN_PURE static inline MAP_ELE_T * MAP_(iter_ele) ( MAP_(iter_t) * iter ) { return iter->ele + iter->ele_idx; }
1361 :
/* iter_fini finishes an iteration, releasing the locks acquired by
   iter_init (restoring the versions cached in iter).  Returns iter. */

static inline MAP_(iter_t) *
MAP_(iter_fini)( MAP_(iter_t) * iter ) {
  MAP_(private_unlock)( iter->lock, iter->lock_cnt, iter->version, iter->version_lock0, iter->version_cnt );
  return iter;
}
1367 :
/* Constructors / destructors: new formats a shared memory region for
   use as a map, join joins the caller to a map and its element store,
   leave undoes a join and delete unformats a region.  See the
   implementations below for the detailed contracts. */

MAP_STATIC void * MAP_(new) ( void * shmem, ulong ele_max, ulong lock_cnt, ulong probe_max, ulong seed );
MAP_STATIC MAP_(t) * MAP_(join) ( void * ljoin, void * shmap, void * shele );
MAP_STATIC void * MAP_(leave) ( MAP_(t) * join );
MAP_STATIC void * MAP_(delete)( void * shmap );

/* hint caches key's hash in query->memo and optionally prefetches the
   covering lock and/or first probe slot (see implementation). */

MAP_STATIC void
MAP_(hint)( MAP_(t) const * join,
            MAP_KEY_T const * key,
            MAP_(query_t) * query,
            int flags );

/* prepare locks key's probe sequence and returns the slot to use for
   an insert (slot free) or modify (slot used); caller completes with
   publish / cancel (see implementation for the detailed contract). */

MAP_STATIC int
MAP_(prepare)( MAP_(t) * join,
               MAP_KEY_T const * key,
               MAP_ELE_T * sentinel,
               MAP_(query_t) * query,
               int flags );

/* remove removes key from the map, repairing probe sequences broken by
   the removal (no tombstones; see implementation). */

MAP_STATIC int
MAP_(remove)( MAP_(t) * join,
              MAP_KEY_T const * key,
              MAP_(query_t) const * query,
              int flags );

/* query_try speculatively finds key's slot; caller confirms reads with
   query_test (see implementation). */

MAP_STATIC int
MAP_(query_try)( MAP_(t) const * join,
                 MAP_KEY_T const * key,
                 MAP_ELE_T const * sentinel,
                 MAP_(query_t) * query,
                 int flags );

/* FIXME: Consider adding txn API too?  Would work recording the start
   of probe sequences for keys in the transaction and then the txn_try
   would use a bitfield to lock all contiguous regions covered by the
   set of probe sequences. */

/* lock_range / iter_init..iter_fini support serialized bulk access to
   the element store (see implementations). */

MAP_STATIC int
MAP_(lock_range)( MAP_(t) * join,
                  ulong range_start,
                  ulong range_cnt,
                  int flags,
                  MAP_VERSION_T * version );

MAP_STATIC int
MAP_(iter_init)( MAP_(t) * join,
                 ulong memo,
                 int flags,
                 MAP_(iter_t) * iter );

MAP_STATIC MAP_(iter_t) *
MAP_(iter_next)( MAP_(iter_t) * iter );

/* verify checks the map invariants; strerror converts an FD_MAP_*
   error code into a human readable cstr. */

MAP_STATIC int MAP_(verify)( MAP_(t) const * join );

MAP_STATIC FD_FN_CONST char const * MAP_(strerror)( int err );
1423 :
1424 : FD_PROTOTYPES_END
1425 :
1426 : #endif
1427 :
1428 : #if MAP_IMPL_STYLE!=1 /* need implementations (assumes header already included) */
1429 :
1430 : #include "../log/fd_log.h" /* Used by constructors and verify (FIXME: Consider making a compile time option) */
1431 :
1432 : MAP_STATIC void *
1433 : MAP_(new)( void * shmem,
1434 : ulong ele_max,
1435 : ulong lock_cnt,
1436 : ulong probe_max,
1437 27 : ulong seed ) {
1438 :
1439 27 : if( FD_UNLIKELY( !shmem ) ) {
1440 3 : FD_LOG_WARNING(( "NULL shmem" ));
1441 3 : return NULL;
1442 3 : }
1443 :
1444 24 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, MAP_(align)() ) ) ) {
1445 3 : FD_LOG_WARNING(( "misaligned shmem" ));
1446 3 : return NULL;
1447 3 : }
1448 :
1449 21 : ulong footprint = MAP_(footprint)( ele_max, lock_cnt, probe_max );
1450 21 : if( FD_UNLIKELY( !footprint ) ) {
1451 15 : FD_LOG_WARNING(( "ele_max, lock_cnt and/or probe_max" ));
1452 15 : return NULL;
1453 15 : }
1454 :
1455 : /* seed arbitrary */
1456 :
1457 : /* Init the metadata */
1458 :
1459 6 : MAP_(shmem_t) * map = (MAP_(shmem_t) *)shmem;
1460 :
1461 6 : memset( map, 0, footprint );
1462 :
1463 6 : map->ele_max = ele_max;
1464 6 : map->lock_cnt = lock_cnt;
1465 6 : map->probe_max = probe_max;
1466 6 : map->seed = seed;
1467 6 : map->lock_shift = fd_ulong_find_msb( ele_max ) - fd_ulong_find_msb( lock_cnt );
1468 :
1469 : /* Note: memset set all the locks to version 0/unlocked */
1470 :
1471 : /* Note: caller set all elements in underlying element store set to
1472 : free (or, more pedantically, to a key-val pair configuration
1473 : consistent with ele_max and probe_max). */
1474 :
1475 6 : FD_COMPILER_MFENCE();
1476 6 : map->magic = MAP_MAGIC;
1477 6 : FD_COMPILER_MFENCE();
1478 :
1479 6 : return shmem;
1480 21 : }
1481 :
1482 : MAP_STATIC MAP_(t) *
1483 : MAP_(join)( void * ljoin,
1484 : void * shmap,
1485 27 : void * shele ) {
1486 27 : MAP_(t) * join = (MAP_(t) *)ljoin;
1487 27 : MAP_(shmem_t) * map = (MAP_(shmem_t) *)shmap;
1488 27 : MAP_ELE_T * ele = (MAP_ELE_T *)shele;
1489 :
1490 27 : if( FD_UNLIKELY( !join ) ) {
1491 3 : FD_LOG_WARNING(( "NULL ljoin" ));
1492 3 : return NULL;
1493 3 : }
1494 :
1495 24 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)join, alignof(MAP_(t)) ) ) ) {
1496 3 : FD_LOG_WARNING(( "misaligned ljoin" ));
1497 3 : return NULL;
1498 3 : }
1499 :
1500 21 : if( FD_UNLIKELY( !map ) ) {
1501 3 : FD_LOG_WARNING(( "NULL shmap" ));
1502 3 : return NULL;
1503 3 : }
1504 :
1505 18 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)map, MAP_(align)() ) ) ) {
1506 3 : FD_LOG_WARNING(( "misaligned shmap" ));
1507 3 : return NULL;
1508 3 : }
1509 :
1510 15 : if( FD_UNLIKELY( map->magic!=MAP_MAGIC ) ) {
1511 3 : FD_LOG_WARNING(( "bad magic" ));
1512 3 : return NULL;
1513 3 : }
1514 :
1515 12 : if( FD_UNLIKELY( !ele ) ) {
1516 3 : FD_LOG_WARNING(( "NULL shele" ));
1517 3 : return NULL;
1518 3 : }
1519 :
1520 9 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)ele, alignof(MAP_ELE_T) ) ) ) {
1521 3 : FD_LOG_WARNING(( "misaligned shele" ));
1522 3 : return NULL;
1523 3 : }
1524 :
1525 6 : join->lock = (MAP_VERSION_T *)(map+1);
1526 6 : join->ele = ele;
1527 6 : join->ele_max = map->ele_max;
1528 6 : join->lock_cnt = map->lock_cnt;
1529 6 : join->probe_max = map->probe_max;
1530 6 : join->seed = map->seed;
1531 6 : join->lock_shift = map->lock_shift;
1532 :
1533 6 : return join;
1534 9 : }
1535 :
/* leave undoes a current local join.  Returns the underlying ljoin
   region on success (caller regains ownership) and NULL on failure
   (logs details). */

MAP_STATIC void *
MAP_(leave)( MAP_(t) * join ) {

  if( FD_UNLIKELY( !join ) ) {
    FD_LOG_WARNING(( "NULL join" ));
    return NULL;
  }

  return (void *)join;
}
1546 :
1547 : MAP_STATIC void *
1548 15 : MAP_(delete)( void * shmap ) {
1549 15 : MAP_(shmem_t) * map = (MAP_(shmem_t) *)shmap;
1550 :
1551 15 : if( FD_UNLIKELY( !map ) ) {
1552 3 : FD_LOG_WARNING(( "NULL shmap" ));
1553 3 : return NULL;
1554 3 : }
1555 :
1556 12 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)map, MAP_(align)() ) ) ) {
1557 3 : FD_LOG_WARNING(( "misaligned shmap" ));
1558 3 : return NULL;
1559 3 : }
1560 :
1561 9 : if( FD_UNLIKELY( map->magic!=MAP_MAGIC ) ) {
1562 3 : FD_LOG_WARNING(( "bad magic" ));
1563 3 : return NULL;
1564 3 : }
1565 :
1566 6 : FD_COMPILER_MFENCE();
1567 6 : map->magic = 0UL;
1568 6 : FD_COMPILER_MFENCE();
1569 :
1570 6 : return (void *)map;
1571 9 : }
1572 :
1573 : void
1574 : MAP_(hint)( MAP_(t) const * join,
1575 : MAP_KEY_T const * key,
1576 : MAP_(query_t) * query,
1577 14991690 : int flags ) {
1578 14991690 : MAP_ELE_T const * ele0 = join->ele;
1579 14991690 : MAP_VERSION_T const * lock = join->lock;
1580 14991690 : ulong ele_max = join->ele_max;
1581 14991690 : ulong seed = join->seed;
1582 14991690 : int lock_shift = join->lock_shift;
1583 :
1584 14991690 : ulong memo = (flags & FD_MAP_FLAG_USE_HINT) ? query->memo : MAP_(key_hash)( key, seed );
1585 14991690 : ulong ele_idx = memo & (ele_max-1UL);
1586 14991690 : ulong lock_idx = ele_idx >> lock_shift;
1587 :
1588 : /* TODO: target specific prefetch hints */
1589 14991690 : if( FD_LIKELY( flags & FD_MAP_FLAG_PREFETCH_META ) ) FD_VOLATILE_CONST( lock[ lock_idx ] );
1590 14991690 : if( FD_LIKELY( flags & FD_MAP_FLAG_PREFETCH_DATA ) ) FD_VOLATILE_CONST( ele0[ ele_idx ] );
1591 :
1592 14991690 : query->memo = memo;
1593 14991690 : }
1594 :
/* prepare starts an insert / modify of key.  On success, returns
   FD_MAP_SUCCESS with query holding the slot for key (marked used if
   key was already mapped, free if the caller may insert it there), the
   lock covering that slot and the lock's pre-operation version; the
   caller finishes with publish or cancel (detailed contract in the
   probe loop comment below).  On failure, returns FD_MAP_ERR_AGAIN
   (lock contention, only returned when non-blocking) or
   FD_MAP_ERR_FULL (probe_max used slots probed without finding key)
   with query->ele set to sentinel.  With FD_MAP_FLAG_BLOCKING,
   contention is retried with random exponential backoff instead of
   returning AGAIN.  With FD_MAP_FLAG_USE_HINT, query->memo is used as
   key's hash. */

int
MAP_(prepare)( MAP_(t) *         join,
               MAP_KEY_T const * key,
               MAP_ELE_T *       sentinel,
               MAP_(query_t) *   query,
               int               flags ) {
  MAP_ELE_T *     ele0       = join->ele;
  MAP_VERSION_T * lock       = join->lock;
  ulong           ele_max    = join->ele_max;
  ulong           lock_cnt   = join->lock_cnt;
  ulong           probe_max  = join->probe_max;
  ulong           seed       = join->seed;
  int             lock_shift = join->lock_shift;
  void *          ctx        = join->ctx;

  ulong memo          = (flags & FD_MAP_FLAG_USE_HINT) ? query->memo : MAP_(key_hash)( key, seed );
  ulong start_idx     = memo & (ele_max-1UL);
  ulong version_lock0 = start_idx >> lock_shift;

  int   non_blocking = !(flags & FD_MAP_FLAG_BLOCKING);
  ulong backoff_max  = (1UL<<32);              /* in [2^32,2^48) */
  ulong backoff_seed = ((ulong)(uint)flags)>>6; /* 0 usually fine */

  for(;;) { /* Fresh try */

    int err;

    MAP_VERSION_T version[ MAP_LOCK_MAX ];
    ulong version_cnt = 0UL;
    ulong lock_idx    = version_lock0;

    /* At this point, finding any key in the map requires testing at
       most probe_max contiguous slots. */

    MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
    if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail; } /* opt for low contention */
    version[ lock_idx ] = v;
    version_cnt++;

    ulong ele_idx = start_idx;

    for( ulong probe_rem=probe_max; probe_rem; probe_rem-- ) {

      /* At this point, we've acquired the locks from the start of key's
         probe sequence to ele_idx inclusive and have tested fewer than
         probe_max slots for key.

         If slot ele_idx is empty, we know that the key is currently not
         in the map and we can insert it here without creating a probe
         sequence longer than probe_max.  This does not lengthen the
         probe sequence for any currently mapped keys, preserving the
         maximum probe sequence length invariant.  Further, this is at
         the end of all keys that map to the same probe sequence start.
         So, we have preserved the key group ordering invariant.

         On return, ele will be marked as free.  To insert key into the
         map, the caller should initialize the slot's key (and memo if
         necessary), mark the slot as used, and publish to complete the
         insert.

         If the caller doesn't want to insert anything (e.g. caller only
         wants to modify an existing value), the caller should keep the
         slot marked as free (doesn't matter how the caller modified any
         other fields) and return the slot as free, and cancel to
         complete the failed insert (publish would also work ... cancel
         has theoretically lower risk of false contention).

         Likewise, if slot ele_idx contains key, we return that slot to
         the caller.  The caller can tell the difference between the
         previous case because the slot will be marked as used.

         On return, the caller can modify the slot's value arbitrarily.
         IMPORTANT SAFETY TIP!  THE CALLER MUST NOT MODIFY THE SLOT'S
         KEY OR MARK THE SLOT AS FREE.  USE REMOVE BELOW TO REMOVE KEYS.
         When done modifying the slot's value, the caller should either
         publish or cancel depending on what the caller did to the
         slot's value and how the application manages access to values
         (publish is always safe but cancel when appropriate has
         theoretically lower risk of false contention).  Note that
         cancel is not appropriate for temporary modifications to value
         (because it can confuse query ABA protection).

         In both cases, since we have the lock that covers slot ele_idx,
         we can unlock any other locks (typically the leading
         version_cnt-1 but possibly the trailing version_cnt-1 in cases
         with maps near capacity) locks already acquired to reduce
         contention with other unrelated operations.  That is, at this
         point, lock lock_idx is sufficient to prevent any operation for
         any key breaking key's probe sequence (because it would need to
         acquire the lock covering ele_idx first). */

      MAP_ELE_T * ele = ele0 + ele_idx;

      if( FD_LIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) || /* opt for low collision */
          (
#     if MAP_MEMOIZE && MAP_KEY_EQ_IS_SLOW
            FD_LIKELY( ele->MAP_MEMO==memo ) &&
#     endif
            FD_LIKELY( MAP_(key_eq)( &ele->MAP_KEY, key ) ) /* opt for already in map */
          ) ) {

        /* Keep only the lock covering ele_idx; release the others.
           version_lock0 is advanced past the kept lock when they
           coincide so the unlock range skips it. */

        lock_idx = ele_idx >> lock_shift;
        version_lock0 = (version_lock0 + (ulong)(version_lock0==lock_idx)) & (lock_cnt-1UL);
        MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt-1UL );

        query->memo = memo;
        query->ele  = ele;
        query->l    = lock + lock_idx;
        query->v    = version[ lock_idx ];
        return FD_MAP_SUCCESS;
      }

      /* At this point, slot ele_idx is used by something other than
         key.  If we still have probes remaining, continue probing for
         key, locking as necessary.  If we can't acquire a lock, we
         fail. */

      ele_idx = (ele_idx+1UL) & (ele_max-1UL);

      /* FIXME: FURTHER RESTRICT TO PROBE_REM>1? */
      ulong lock_next = ele_idx >> lock_shift;
      if( FD_UNLIKELY( (lock_next!=lock_idx) & (lock_next!=version_lock0) ) ) { /* opt for locks that cover many contiguous slots */
        lock_idx = lock_next;

        MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
        if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail; } /* opt for low contention */
        version[ lock_idx ] = v;
        version_cnt++;
      }
    }

    /* At this point, we've done probe_max probes without encountering
       key and we have all the locks.  So we know key is not in the map
       and that, even if we have space, inserting this key will create a
       probe sequence longer than probe_max.  That is, map is loaded
       enough that we consider it full.

       If probe_max==ele_max, this meaning of full is the traditional
       non-concurrent meaning of full (literally every slot is known to
       be used).  Even if probe_max << ele_max, it is possible to fill
       every slot (e.g. at probe_max==1, a perfect hash of ele_max keys
       to slot would fill every slot). */

    err = FD_MAP_ERR_FULL;

  fail:

    MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt );

    if( FD_UNLIKELY( non_blocking | (err!=FD_MAP_ERR_AGAIN) ) ) {
      query->memo = memo;
      query->ele  = sentinel;
      query->l    = NULL;
      query->v    = (MAP_VERSION_T)0;
      return err;
    }

    /* At this point, we hit contention and are blocking (need to try
       again).  We do a random exponential backoff (with saturation on
       wrapping) to minimize contention with other threads.  Normalizing
       out fixed point scalings baked into the below, we spin pause a
       uniform IID random number of times in [0,backoff_max) where
       backoff_max is 1 on the first hit and increases by ~30% each time
       to a maximum of 2^16 (i.e. hundreds microseconds per remaining
       lock for typical CPU speeds and spin pause delays at maximum
       backoff). */

    ulong scale = backoff_max >> 16; /* in [2^16,2^32) */
    backoff_max = fd_ulong_min( backoff_max + (backoff_max>>2) + (backoff_max>>4), (1UL<<48)-1UL ); /* in [2^32,2^48) */
    MAP_(backoff)( scale, backoff_seed );

  }

  /* never get here */

}
1771 :
/* remove removes key from the map, repairing any probe sequences broken
   by the removal (no tombstones needed).  Returns FD_MAP_SUCCESS on
   success, FD_MAP_ERR_KEY if key was not in the map at some point
   during the call and FD_MAP_ERR_AGAIN on lock contention (only when
   non-blocking; with FD_MAP_FLAG_BLOCKING contention is retried with
   random exponential backoff).  With FD_MAP_FLAG_USE_HINT, query->memo
   is used as key's hash. */

int
MAP_(remove)( MAP_(t) *             join,
              MAP_KEY_T const *     key,
              MAP_(query_t) const * query,
              int                   flags ) {

  MAP_VERSION_T * lock       = join->lock;
  ulong           lock_cnt   = join->lock_cnt;
  ulong           seed       = join->seed;
  ulong           probe_max  = join->probe_max;
  MAP_ELE_T *     ele0       = join->ele;
  ulong           ele_max    = join->ele_max;
  int             lock_shift = join->lock_shift;
  void *          ctx        = join->ctx;

  ulong memo          = (flags & FD_MAP_FLAG_USE_HINT) ? query->memo : MAP_(key_hash)( key, seed );
  ulong start_idx     = memo & (ele_max-1UL);
  ulong version_lock0 = start_idx >> lock_shift;

  int   non_blocking = !(flags & FD_MAP_FLAG_BLOCKING);
  ulong backoff_max  = (1UL<<32);              /* in [2^32,2^48) */
  ulong backoff_seed = ((ulong)(uint)flags)>>6; /* 0 usually fine */

  for(;;) { /* Fresh try */

    int err;

    MAP_VERSION_T version[ MAP_LOCK_MAX ];
    ulong version_cnt = 0UL;
    ulong lock_idx    = version_lock0;

    /* At this point, we need to acquire locks covering the start of the
       probe sequence through up to all contiguously used slots (and, if
       the map is not completely full, the trailing empty slot). */

    MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
    if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail; } /* opt for low contention */
    version[ lock_idx ] = v;
    version_cnt++;

    ulong ele_idx  = start_idx;
    ulong hole_idx = start_idx;
    int   found    = 0;

    ulong contig_cnt;
    for( contig_cnt=0UL; contig_cnt<ele_max; contig_cnt++ ) {

      /* At this point, we've acquired the locks covering slots
         [start_idx,ele_idx] (cyclic) and have confirmed that the
         contig_cnt slots [start_idx,ele_idx) (cyclic) are used.

         If slot ele_idx is empty, we are done probing.

         Otherwise, if we haven't found key yet, we test if slot ele_idx
         contains key.

         We can optimize this further by noting that the key can only be
         in the first probe_max probes and that when we don't find the
         key, remove has nothing to do (such that we don't have to keep
         probing for contiguous slots). */

      MAP_ELE_T const * ele = ele0 + ele_idx;

      if( FD_UNLIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) break; /* opt for first pass low collision */

      if( FD_LIKELY( !found ) ) { /* opt for first pass low collision */
        if( FD_UNLIKELY( contig_cnt>=probe_max ) ) break; /* opt for first pass low collision */
        found =
#       if MAP_MEMOIZE && MAP_KEY_EQ_IS_SLOW
          FD_LIKELY( ele->MAP_MEMO==memo ) &&
#       endif
          MAP_(key_eq)( &ele->MAP_KEY, key );
        if( found ) hole_idx = ele_idx; /* cmov */
      }

      /* Continue probing, locking as necessary.  If we can't acquire a
         lock, fail. */

      ele_idx = (ele_idx+1UL) & (ele_max-1UL);

      ulong lock_next = ele_idx >> lock_shift;
      if( FD_UNLIKELY( (lock_next!=lock_idx) & (lock_next!=version_lock0) ) ) { /* opt for locks covering many contiguous slots */
        lock_idx = lock_next;

        MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
        if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail; } /* opt for low contention */
        version[ lock_idx ] = v;
        version_cnt++;
      }
    }

    /* At this point, if we haven't found the key, key did not exist in
       the map at some point during the call.  Release the locks and
       tell the user the key was already removed. */

    if( FD_UNLIKELY( !found ) ) { err = FD_MAP_ERR_KEY; goto fail; }

    /* At this point, we have locks covering the contig_cnt used slots
       starting from start_idx cyclic (and, if contig_cnt<ele_max, any
       trailing empty slot).  The key to remove is in this range at
       hole_idx.  Further, all probe sequences are intact.  Make a hole
       at hole_idx by freeing the key.  Also update the cached lock
       version to indicate "modified" when we unlock below. */

    MAP_(private_ele_free)( ctx, ele0 + hole_idx );

    lock_idx = hole_idx >> lock_shift;
    version[ lock_idx ] = (MAP_VERSION_T)((ulong)version[ lock_idx ] + 2UL);

    /* When contig_cnt<ele_max, the trailing empty slot guarantees that
       the just made hole didn't break any probe sequences for keys not
       in the contig_cnt slots and that it didn't break any probe
       sequences in [start_idx,hole_idx).  Probe sequences for keys in
       (hole_idx,start_idx+contig_cnt) (cyclic) might have been broken
       though.

       We fix the first key with a broken probe sequence by moving it to
       the hole just made.  This fills the hole but makes a new hole
       (and one closer to the empty trailing slot) in the process.  As
       this shortens the probe sequence for that key, this doesn't break
       any probe length invariants.  We are repeating this process until
       we've fixed all the contiguous slots after hole_idx.  (As an
       additional optimization to reduce remove costs when map is nearly
       full but probe_max << ele_max, we could exploit that only the
       leading probe_max-1 slots after any created hole might have
       broken probe sequences.)

       Unfortunately, when contig_cnt==ele_max, we no longer have this
       guarantee.  But we do have the entire map locked at this point.
       And we know that probe sequences are intact starting from the
       most recently created hole.  If we verify enough to eventually
       wrap back to most recently created hole, we know all probe
       sequences are intact.  Since fixing broken probe sequences in
       this fashion always shortens them and there always will be one
       hole in this process, verifying until we hit the most recently
       made hole is guaranteed to terminate.  Since there is only one
       hole, it is sufficient to just test if the next slot to verify is
       a hole.

       This test works just as well for the more common
       contig_cnt<ele_max case (it will terminate at the preexisting
       trailing empty slot instead of the most recently created hole).
       So, for code simplicity, we just do that.

       A nice side effect is this removal process is that implicitly
       improves probing for remaining keys in the map and does not
       require tombstones.

       TL;DR It's a bad idea on many levels to fill up linearly probed
       maps to their absolute limits ... but this will still work if you
       do.

       Note also that this process preserves the ordering of keys that
       hash to the same slot (such that key group ordering is
       preserved). */

    ele_idx = hole_idx;
    for(;;) {
      ele_idx = (ele_idx+1UL) & (ele_max-1UL);

      /* At this point, slots (hole_idx,ele_idx) (cyclic) are used with
         verified probe sequences.  As per the above, we are guaranteed
         to eventually hit an empty slot (typically very quickly in
         practice) and hitting an empty slot guarantees all probe
         sequences are intact (such that we are done). */

      MAP_ELE_T * ele = ele0 + ele_idx;
      if( FD_LIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) break;

      /* Otherwise, if ele_idx's key probe sequence doesn't start in
         (hole_idx,ele_idx] (cyclic), its probe sequence is currently
         broken by the hole at hole_idx.  We fix it by moving ele_idx to
         hole_idx.  This shortens that key's probe sequence (preserving
         the invariant) and makes a new hole at ele_idx.  We mark the
         lock version covering the new hole idx as modified for the
         unlock below.  Note that the version for the existing hole was
         already marked as modified when the hole was created so we only
         bump if ele_idx is covered by a different lock than hole_idx to
         reduce version churn to near theoretical minimum. */

#     if MAP_MEMOIZE
      memo = ele->MAP_MEMO;
#     else
      memo = MAP_(key_hash)( &ele->MAP_KEY, seed );
#     endif
      start_idx = memo & (ele_max-1UL);

      if( !( ((hole_idx<start_idx) & (start_idx<=ele_idx) ) |
             ((hole_idx>ele_idx) & ((hole_idx<start_idx) | (start_idx<=ele_idx))) ) ) {

        MAP_(private_ele_move)( ctx, ele0 + hole_idx, ele );

        ulong lock_next = ele_idx >> lock_shift;
        version[ lock_next ] = (MAP_VERSION_T)((ulong)version[ lock_next ] + ((lock_next!=lock_idx) ? 2UL : 0UL) /* cmov */);
        lock_idx = lock_next;

        hole_idx = ele_idx;
      }

    }

    /* At this point, key is removed and all remaining keys have intact
       and ordered probe sequences and we have updated the necessary
       version cache entries.  Unlock and return success. */

    MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt );
    return FD_MAP_SUCCESS;

  fail:

    MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt );

    if( FD_UNLIKELY( non_blocking | (err!=FD_MAP_ERR_AGAIN) ) ) return err;

    /* At this point, we are blocking and hit contention.  Backoff.  See
       note in prepare for how this works */

    ulong scale = backoff_max >> 16; /* in [2^16,2^32) */
    backoff_max = fd_ulong_min( backoff_max + (backoff_max>>2) + (backoff_max>>4), (1UL<<48)-1UL ); /* in [2^32,2^48) */
    MAP_(backoff)( scale, backoff_seed );
  }

  /* never get here */
}
1996 :
/* query_try speculatively finds the slot holding key, using no atomic
   operations (locks are only observed via private_try).  On success,
   returns FD_MAP_SUCCESS and query holds the slot, the lock covering it
   and the version observed; the caller then reads the element and calls
   query_test to confirm the reads were not clobbered.  On failure,
   returns FD_MAP_ERR_KEY (key was not in the map at some point during
   the call, confirmed by re-testing the observed versions) or
   FD_MAP_ERR_AGAIN (contention; only returned when non-blocking, else
   retried with backoff) with query->ele set to sentinel. */

int
MAP_(query_try)( MAP_(t) const *   join,
                 MAP_KEY_T const * key,
                 MAP_ELE_T const * sentinel,
                 MAP_(query_t) *   query,
                 int               flags ) {

  MAP_ELE_T *     ele0       = join->ele;
  MAP_VERSION_T * lock       = join->lock;
  ulong           ele_max    = join->ele_max;
  ulong           lock_cnt   = join->lock_cnt;
  ulong           probe_max  = join->probe_max;
  ulong           seed       = join->seed;
  int             lock_shift = join->lock_shift;
  void const *    ctx        = join->ctx;

  ulong memo          = (flags & FD_MAP_FLAG_USE_HINT) ? query->memo : MAP_(key_hash)( key, seed );
  ulong start_idx     = memo & (ele_max-1UL);
  ulong version_lock0 = start_idx >> lock_shift;

  int   non_blocking = !(flags & FD_MAP_FLAG_BLOCKING);
  ulong backoff_max  = (1UL<<32);              /* in [2^32,2^48) */
  ulong backoff_seed = ((ulong)(uint)flags)>>6; /* 0 usually fine */

  for(;;) { /* fresh try */

    int err;

    MAP_VERSION_T version[ MAP_LOCK_MAX ];
    ulong version_cnt = 0UL;
    ulong lock_idx    = version_lock0;

    /* At this point, finding any key in the map requires probing at
       most probe_max contiguous slots. */

    MAP_VERSION_T v = MAP_(private_try)( lock + lock_idx );
    if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail_fast; } /* opt for low contention */
    version[ lock_idx ] = v;
    version_cnt++;

    ulong ele_idx = start_idx;

    for( ulong probe_rem=probe_max; probe_rem; probe_rem-- ) {

      /* At this point, we've observed the locks covering the start of
         key's probe sequence to ele_idx inclusive, they were unlocked
         when observed and we have done fewer than probe_max probes.

         If slot ele_idx is empty, we speculate that key was not in the
         map at some point during the call.

         If slot ele_idx holds key, we let the caller continue
         speculating about key's value.  We only need to observe the
         lock covering key after we've found it (if key gets moved or
         removed, the version of the lock covering it will change). */

      MAP_ELE_T const * ele = ele0 + ele_idx;

      if( FD_UNLIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) { err = FD_MAP_ERR_KEY; goto fail; } /* opt for low collision */

      if(
#     if MAP_MEMOIZE && MAP_KEY_EQ_IS_SLOW
          FD_LIKELY( ele->MAP_MEMO==memo ) &&
#     endif
          FD_LIKELY( MAP_(key_eq)( &ele->MAP_KEY, key ) ) /* opt for found */
        ) {

        lock_idx = ele_idx >> lock_shift;

        query->memo = memo;
        query->ele  = (MAP_ELE_T *)ele;
        query->l    = lock + lock_idx;
        query->v    = version[ lock_idx ];
        return FD_MAP_SUCCESS;
      }

      /* At this point, we speculate slot ele_idx was used by something
         other than key when observed.  Continue probing slot for key,
         observing locks as necessary. */

      ele_idx = (ele_idx+1UL) & (ele_max-1UL);

      /* FIXME: FURTHER RESTRICT TO PROBE_REM>1? */
      ulong lock_next = ele_idx >> lock_shift;
      if( FD_UNLIKELY( (lock_next!=lock_idx) & (lock_next!=version_lock0) ) ) { /* opt for locks cover many contiguous slots */
        lock_idx = lock_next;

        v = MAP_(private_try)( lock + lock_idx );
        if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail_fast; } /* opt for low contention */
        version[ lock_idx ] = v;
        version_cnt++;
      }
    }

    /* At this point, we did probe_max probes without finding key.  We
       speculate key was not in the map at some point during the call. */

    err = FD_MAP_ERR_KEY;

  fail:

    /* If we didn't encounter any contention (i.e. no version numbers
       changed), we can trust our speculated error.  Otherwise, we tell
       the user to try again. */

    err = MAP_(private_test)( lock, lock_cnt, version, version_lock0, version_cnt ) ? FD_MAP_ERR_AGAIN : err; /* cmov */

  fail_fast: /* Used when the err is already AGAIN */

    if( FD_UNLIKELY( non_blocking | (err!=FD_MAP_ERR_AGAIN) ) ) {
      query->memo = memo;
      query->ele  = (MAP_ELE_T *)sentinel;
      query->l    = NULL;
      query->v    = (MAP_VERSION_T)0;
      return err;
    }

    /* At this point, we are blocking and hit contention.  Backoff.  See
       note in prepare for how this works */

    ulong scale = backoff_max >> 16; /* in [2^16,2^32) */
    backoff_max = fd_ulong_min( backoff_max + (backoff_max>>2) + (backoff_max>>4), (1UL<<48)-1UL ); /* in [2^32,2^48) */
    MAP_(backoff)( scale, backoff_seed );
  }
  /* never get here */
}
2123 :
int
MAP_(lock_range)( MAP_(t) * join,
                  ulong range_start,
                  ulong range_cnt,
                  int flags,
                  MAP_VERSION_T * version ) {

  /* Acquire the range_cnt locks [range_start,range_start+range_cnt)
     (cyclic over the join's lock_cnt locks ... the wrap mask below
     requires lock_cnt to be a power of two, cf. verify).  On success,
     version[ lock_idx ] records, for each acquired lock, the version
     observed at acquisition, advanced by 2 unless FD_MAP_FLAG_RDONLY
     was set (presumably so writers bump the version seen by concurrent
     speculative readers ... the stored values are what private_unlock
     publishes on release).  Returns FD_MAP_SUCCESS on success.  If a
     lock in the range is already held (odd version), releases whatever
     was acquired so far and either returns FD_MAP_ERR_AGAIN
     (non-blocking) or backs off and retries the whole range
     (blocking). */

  MAP_VERSION_T * lock     = join->lock;
  ulong           lock_cnt = join->lock_cnt;

  int   non_blocking  = !(flags & FD_MAP_FLAG_BLOCKING);
  ulong backoff_max   = (1UL<<32);               /* in [2^32,2^48) */
  ulong backoff_seed  = ((ulong)(uint)flags)>>6; /* 0 usually fine */
  ulong version_delta = (flags & FD_MAP_FLAG_RDONLY) ? 0UL : 2UL;

  for(;;) { /* fresh try */

    ulong lock_idx   = range_start;
    ulong locked_cnt = 0UL;
    for( ; locked_cnt<range_cnt; locked_cnt++ ) {

      /* Bit 0 of a version is its lock bit; an odd result means some
         other operation holds this lock. */

      MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
      if( FD_UNLIKELY( (ulong)v & 1UL ) ) goto fail; /* opt for low contention */
      version[ lock_idx ] = (MAP_VERSION_T)((ulong)v + version_delta);
      lock_idx = (lock_idx+1UL) & (lock_cnt-1UL);
    }

    return FD_MAP_SUCCESS;

  fail:

    /* Release the locked_cnt locks acquired before the conflict. */

    MAP_(private_unlock)( lock, lock_cnt, version, range_start, locked_cnt );

    if( FD_UNLIKELY( non_blocking ) ) return FD_MAP_ERR_AGAIN;

    /* At this point, we are blocking and hit contention.  Backoff.  See
       note in prepare for how this works */

    ulong scale = backoff_max >> 16; /* in [2^16,2^32) */
    backoff_max = fd_ulong_min( backoff_max + (backoff_max>>2) + (backoff_max>>4), (1UL<<48)-1UL ); /* in [2^32,2^48) */
    MAP_(backoff)( scale, backoff_seed );
  }
  /* never get here */
}
2166 :
int
MAP_(iter_init)( MAP_(t) * join,
                 ulong memo,
                 int flags,
                 MAP_(iter_t) * iter ) {

  /* Start an iteration over all key-val pairs in the map whose memo
     (key hash) equals memo.  Probes at most probe_max contiguous slots
     starting at the memo's initial probe slot, acquiring the covering
     locks as it goes (versions advanced by 2 unless
     FD_MAP_FLAG_RDONLY).  On success, positions the iterator on the
     first matching pair (if any) and records how many pairs matched and
     which locks/versions were taken so that iteration and fini can
     proceed.  On lock contention: non-blocking returns
     FD_MAP_ERR_AGAIN with the iterator neutered (no pairs to iterate,
     fini a no-op); blocking backs off and retries. */

  MAP_ELE_T *     ele0       = join->ele;
  MAP_VERSION_T * lock       = join->lock;
  ulong           ele_max    = join->ele_max;
  ulong           lock_cnt   = join->lock_cnt;
  ulong           probe_max  = join->probe_max;
  ulong           seed       = join->seed;
  int             lock_shift = join->lock_shift;
  void *          ctx        = join->ctx;

  MAP_VERSION_T * version = iter->version;

  ulong start_idx     = memo & (ele_max-1UL);    /* memo's initial probe slot */
  ulong version_lock0 = start_idx >> lock_shift; /* lock covering that slot */
  ulong version_delta = (flags & FD_MAP_FLAG_RDONLY) ? 0UL : 2UL;

  int   non_blocking = !(flags & FD_MAP_FLAG_BLOCKING);
  ulong backoff_max  = (1UL<<32); /* in [2^32,2^48) */
  ulong backoff_seed = ((ulong)(uint)flags)>>6; /* 0 usually fine */

  for(;;) { /* fresh try */

    ulong version_cnt = 0UL;
    ulong lock_idx    = version_lock0;

    /* At this point, finding any key-val pair that matches memo in the
       map requires probing at most probe_max contiguous slots. */

    MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
    if( FD_UNLIKELY( (ulong)v & 1UL ) ) goto fail; /* opt for low contention */
    version[ lock_idx ] = (MAP_VERSION_T)((ulong)v + version_delta);
    version_cnt++;

    ulong ele_idx = start_idx;
    ulong ele_rem = 0UL;

    ulong iter_cnt   = 0UL;       /* number of matching pairs found so far */
    ulong iter_start = start_idx; /* slot of first matching pair (start_idx if none yet) */

    for( ; ele_rem<probe_max; ele_rem++ ) {

      /* At this point, we've acquired the locks covering slots
         [start_idx,ele_idx] (cyclic) and have confirmed that the
         ele_rem slots [start_idx,ele_idx) (cyclic) are used.  If slot
         ele_idx is empty, we are done probing. */

      MAP_ELE_T const * ele = ele0 + ele_idx;

      if( FD_UNLIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) break; /* opt for first pass low collision */

      iter_start = fd_ulong_if( iter_cnt==0UL, ele_idx, iter_start );
#   if MAP_MEMOIZE
      iter_cnt += (ulong)(ele->MAP_MEMO==memo);
#   else
      iter_cnt += (ulong)(MAP_(key_hash)( &ele->MAP_KEY, seed )==memo);
#   endif

      /* Continue probing, locking as necessary.  If we can't acquire a
         lock, fail. */

      ele_idx = (ele_idx+1UL) & (ele_max-1UL);

      ulong lock_next = ele_idx >> lock_shift;
      if( FD_UNLIKELY( (lock_next!=lock_idx) & (lock_next!=version_lock0) ) ) { /* opt for locks covering many contiguous slots */
        lock_idx = lock_next;

        MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
        if( FD_UNLIKELY( (ulong)v & 1UL ) ) goto fail; /* opt for low contention */
        version[ lock_idx ] = (MAP_VERSION_T)((ulong)v + version_delta);
        version_cnt++;
      }
    }

    /* At this point, we've acquired the locks covering used slots
       [start_idx,start_idx+ele_rem) (cyclic) where ele_rem<=probe_max
       (and, if ele_rem<probe_max, any trailing empty slot).  iter_cnt
       is the number of slots that matched in this range and iter_start
       is the index of the first element in this range that matched
       (start_idx if no matches). */

    iter->ele           = ele0;
    iter->lock          = lock;
    iter->ele_max       = ele_max;
    iter->lock_cnt      = lock_cnt;
    iter->seed          = seed;
    iter->memo          = memo;
    iter->ele_rem       = iter_cnt;
    iter->ele_idx       = iter_start;
    iter->version_lock0 = version_lock0;
    iter->version_cnt   = version_cnt;
    /* iter->version initialized above */

    return FD_MAP_SUCCESS;

  fail:

    /* At this point, we hit contention acquiring the locks for
       iteration.  If we are not blocking, tell caller to try again
       later.  Otherwise, backoff.  See note in prepare for how this
       works. */

    MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt );

    if( FD_UNLIKELY( non_blocking ) ) {
      iter->ele_rem     = 0UL; /* make sure can't iterate */
      iter->version_cnt = 0UL; /* make sure fini is a no-op */
      return FD_MAP_ERR_AGAIN;
    }

    ulong scale = backoff_max >> 16; /* in [2^16,2^32) */
    backoff_max = fd_ulong_min( backoff_max + (backoff_max>>2) + (backoff_max>>4), (1UL<<48)-1UL ); /* in [2^32,2^48) */
    MAP_(backoff)( scale, backoff_seed );
  }
  /* never get here */
}
2286 :
2287 : MAP_(iter_t) *
2288 3743982 : MAP_(iter_next)( MAP_(iter_t) * iter ) {
2289 3743982 : ulong ele_idx = iter->ele_idx;
2290 3743982 : ulong ele_rem = iter->ele_rem - 1UL;
2291 :
2292 : /* We just finished processing pair ele_idx and we have ele_rem
2293 : more pairs to process. If there is at least 1, scan for it. */
2294 :
2295 3743982 : if( ele_rem ) {
2296 0 : MAP_ELE_T * ele0 = iter->ele;
2297 0 : ulong ele_max = iter->ele_max;
2298 0 : ulong seed = iter->seed; (void)seed;
2299 0 : ulong memo = iter->memo;
2300 :
2301 0 : for(;;) {
2302 0 : ele_idx = (ele_idx+1UL) & (ele_max-1UL);
2303 0 : MAP_ELE_T * ele = ele0 + ele_idx;
2304 : # if MAP_MEMOIZE
2305 : if( FD_LIKELY( ele->MAP_MEMO==memo ) ) break;
2306 : # else
2307 0 : if( FD_LIKELY( MAP_(key_hash)( &ele->MAP_KEY, seed )==memo ) ) break;
2308 0 : # endif
2309 0 : }
2310 0 : }
2311 :
2312 3743982 : iter->ele_idx = ele_idx;
2313 3743982 : iter->ele_rem = ele_rem;
2314 3743982 : return iter;
2315 3743982 : }
2316 :
MAP_STATIC int
MAP_(verify)( MAP_(t) const * join ) {

  /* Verify the integrity of a map join and the underlying element
     store.  Returns FD_MAP_SUCCESS if all invariants hold and
     FD_MAP_ERR_INVAL on the first failed invariant (logging a warning
     naming the failed check).  NOTE(review): performs unlocked reads of
     the element store, so this appears to assume no concurrent map
     operations are in progress ... confirm with the API docs above. */

# define MAP_TEST(c) do {                                                                      \
    if( FD_UNLIKELY( !(c) ) ) { FD_LOG_WARNING(( "FAIL: %s", #c )); return FD_MAP_ERR_INVAL; } \
  } while(0)

  /* Validate join */

  MAP_TEST( join );
  MAP_TEST( fd_ulong_is_aligned( (ulong)join, alignof(MAP_(t)) ) );

  MAP_ELE_T const *     ele0       = join->ele;
  MAP_VERSION_T const * lock       = join->lock;
  ulong                 ele_max    = join->ele_max;
  ulong                 lock_cnt   = join->lock_cnt;
  ulong                 probe_max  = join->probe_max;
  ulong                 seed       = join->seed;
  int                   lock_shift = join->lock_shift;
  void const *          ctx        = join->ctx;

  MAP_TEST( ele0 );
  MAP_TEST( fd_ulong_is_aligned( (ulong)ele0, alignof(MAP_ELE_T) ) );
  MAP_TEST( lock );
  MAP_TEST( fd_ulong_is_aligned( (ulong)lock, MAP_(align)() ) );
  MAP_TEST( fd_ulong_is_pow2( ele_max ) );
  MAP_TEST( fd_ulong_is_pow2( lock_cnt ) );
  MAP_TEST( lock_cnt <= fd_ulong_min( ele_max, MAP_LOCK_MAX ) );
  MAP_TEST( (1UL<=probe_max) & (probe_max<=ele_max) );
  /* seed is arbitrary */
  MAP_TEST( (1UL<<lock_shift) == (ele_max/lock_cnt) ); /* each lock covers 2^lock_shift contiguous slots */

  /* Validate map metadata (the shared header precedes the lock array) */

  MAP_(shmem_t) const * map = ((MAP_(shmem_t) const *)lock)-1;

  MAP_TEST( map );
  MAP_TEST( fd_ulong_is_aligned( (ulong)map, MAP_(align)() ) );
  MAP_TEST( map->magic      == MAP_MAGIC   );
  MAP_TEST( map->ele_max    == ele_max     );
  MAP_TEST( map->lock_cnt   == lock_cnt    );
  MAP_TEST( map->probe_max  == probe_max   );
  MAP_TEST( map->seed       == seed        );
  MAP_TEST( map->lock_shift == lock_shift  );

  /* Validate map elements */

  for( ulong ele_idx=0UL; ele_idx<ele_max; ele_idx++ ) {
    MAP_ELE_T const * ele = ele0 + ele_idx;
    if( FD_LIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) continue; /* opt for sparse */

    ulong memo = MAP_(key_hash)( &ele->MAP_KEY, seed );

#   if MAP_MEMOIZE
    MAP_TEST( ele->MAP_MEMO==memo ); /* memoized hash must match the freshly computed one */
#   endif

    /* probe_cnt is the cyclic distance (inclusive) from this key's
       initial probe slot to the slot actually holding it; a key must be
       reachable within probe_max probes. */

    ulong probe_idx = memo & (ele_max-1UL);
    ulong probe_cnt = fd_ulong_if( ele_idx>=probe_idx, ele_idx - probe_idx, ele_max + ele_idx - probe_idx ) + 1UL;
    MAP_TEST( probe_cnt<=probe_max );

    /* Walk the key's probe sequence: every slot on it must be used
       (no holes), only the final slot may hold this key (keys are
       unique), and the final slot must hold it. */

    for( ulong probe_rem=probe_cnt; probe_rem; probe_rem-- ) {
      MAP_ELE_T const * probe = ele0 + probe_idx;
      MAP_TEST( !MAP_(private_ele_is_free)( ctx, probe ) );

      int found =
#     if MAP_MEMOIZE && MAP_KEY_EQ_IS_SLOW
        FD_LIKELY( probe->MAP_MEMO == ele->MAP_MEMO ) && /* cheap memo filter before the slow key compare */
#     endif
        MAP_(key_eq)( &probe->MAP_KEY, &ele->MAP_KEY );

      MAP_TEST( (probe_rem==1UL) ? found : !found );

      probe_idx = (probe_idx+1UL) & (ele_max-1UL);
    }
  }

  /* At this point, every key in the map is reachable via its probe
     sequence and every probe sequence is at most probe_max probes long.
     By extension, if a key is in the map, it will be found in at most
     probe_max probes. */

# undef MAP_TEST

  return FD_MAP_SUCCESS;
}
2403 :
2404 : MAP_STATIC char const *
2405 18 : MAP_(strerror)( int err ) {
2406 18 : switch( err ) {
2407 3 : case FD_MAP_SUCCESS: return "success";
2408 3 : case FD_MAP_ERR_INVAL: return "bad input";
2409 3 : case FD_MAP_ERR_AGAIN: return "try again later";
2410 3 : case FD_MAP_ERR_FULL: return "map too full";
2411 3 : case FD_MAP_ERR_KEY: return "key not found";
2412 3 : default: break;
2413 18 : }
2414 3 : return "unknown";
2415 18 : }
2416 :
2417 : #endif
2418 :
2419 : #undef MAP_
2420 : #undef MAP_STATIC
2421 :
2422 : #undef MAP_IMPL_STYLE
2423 : #undef MAP_MAGIC
2424 : #undef MAP_ALIGN
2425 : #undef MAP_LOCK_MAX
2426 : #undef MAP_VERSION_T
2427 : #undef MAP_CTX_MAX
2428 : #undef MAP_ELE_MOVE
2429 : #undef MAP_ELE_FREE
2430 : #undef MAP_ELE_IS_FREE
2431 : #undef MAP_KEY_EQ_IS_SLOW
2432 : #undef MAP_MEMO
2433 : #undef MAP_MEMOIZE
2434 : #undef MAP_KEY_HASH
2435 : #undef MAP_KEY_EQ
2436 : #undef MAP_KEY
2437 : #undef MAP_KEY_T
2438 : #undef MAP_ELE_T
2439 : #undef MAP_NAME
|