Line data Source code
1 : /* Generate prototypes, inlines and/or implementations for an ultra high
2 : performance bplus-tree-based key-val store. A bplus tree can be
3 : persisted beyond the lifetime of creating process, used concurrently,
4 : used IPC, relocated in memory, naively serialized/deserialized and/or
5 : moved between hosts. Virtually all operations on a bplus tree are a
6 : fast O(lg N) (where N is the number of elements stored) or better in
7 : worst case.
8 :
9 : At its core, this is fast binary search on a sorted array. But the
10 : sorted array has been partition into leaves where each leaf is
11 : responsible for a continuous disjoint portion of the key space and
12 : union of the ranges covered by the leaves covers the entire key
13 : space. The leaves are stored in a tree whose nodes have a large and
14 : flexible number of branches per each node that specify how leaves
15 : completely partition key space. Further, to support fast forward and
16 : reverse iteration, the leaves are organized into a sorted doubly
17 : linked list. Lastly, the interior nodes and leaves are guaranteed to
18 : be full enough that query has a fast O(lg N) worst case and have
19 : enough slack that insert / upsert / remove also have fast O(lg N)
20 : worst case.
21 :
22 : This leads to a number of improvements over textbook bplus trees,
23 : including:
24 :
25 : - Removal doesn't require nearly as much reshuffling of the interior
26 : nodes. The only requirement here is that interior nodes form a
27 : complete partitioning of the key space. (There is no requirement
28 : that interior nodes store copy of keys found in leaf nodes.)
29 :
30 : - No extra storage in the node is required for identifying whether or
31 : not an interior node or a leaf.
32 :
33 : - The leaf pair max radix can be independently tuned from the
34 : interior node pair radix (especially useful if sizing interior
35 : nodes and leaves to match things like memory page sizes).
36 :
37 : - Supports fast reverse iteration.
38 :
39 : Typical usage:
40 :
41 : struct mypair {
42 : mykey_t key;
43 :
44 : ... key can be located arbitrarily in struct (and renamed if
45 : ... needed ... see BPLUS_PAIR_KEY). It is managed by the bplus
46 : ... tree and should not be modified).
47 :
48 : ... IMPORTANT SAFETY TIP! The location of a pair can be changed
49 : ... by insert / upsert / remove operations.
50 :
51 : };
52 :
53 : typedef struct mypair mypair_t;
54 :
55 : #define BPLUS_NAME mybplus
56 : #define BPLUS_KEY_T mykey_t
57 : #define BPLUS_PAIR_T mypair_t
58 : #define BPLUS_KEY_CMP(a,b) mykeycmp( (a), (b) )
59 : #include "fd_bplus.c"
60 :
61 : will provide the following APIs as a header only style library in the
62 : compilation unit:
63 :
64 : // A myplus_t is an opaque handle to a join to a bplus tree
65 :
66 : struct mybplus_private;
67 : typedef struct mybplus_private bplus_t;
68 :
69 : // A myplus_iter_t is an opaque handle to a bplus tree iterator
70 :
71 : struct mybplus_private_iter;
72 : typedef struct mybplus_private_iter mybplus_iter_t;
73 :
74 : // Constructors
75 :
76 : // mybplus_{leaf,node}_max_est returns a conservative estimate of
77 : // the number of {leaves,nodes} needed for a worst case bplus tree
78 : // containing ele_max_est elements.
79 :
80 : ulong mybplus_leaf_max_est( ulong ele_max_est );
81 : ulong mybplus_node_max_est( ulong ele_max_est );
82 :
83 : // mybplus_{align,footprint,new,join,leave,delete} have the usual
84 : // persistent IPC object constructors / destructors semantics.
85 :
86 : ulong mybplus_align ( void );
87 : ulong mybplus_footprint( ulong node_max, ulong leaf_max );
88 :
89 : void * mybplus_new ( void * shmem, ulong node_max, ulong leaf_max );
90 : mybplus_t * mybplus_join ( void * shbplus );
91 : void * mybplus_leave ( mybplus_t * join );
92 : void * mybplus_delete( void * shbplus );
93 :
94 : // Accessors
95 :
96 : // mybplus_{node,leaf}_max return the {node,leaf}_max values used
97 : // to construct the bplus tree. Assumes join is a current local
98 : // join. Fast O(1) worst case.
99 :
100 : ulong mybplus_node_max( mybplus_t const * join );
101 : ulong mybplus_leaf_max( mybplus_t const * join );
102 :
103 : // mybplus_is_empty returns 1 if the bplus tree contains no pairs
104 : // and 0 otherwise. Assumes join is a current local join. Fast
105 : // O(1) worst case.
106 :
107 : int mybplus_is_empty( mybplus_t const * join );
108 :
109 : // mybplus_{min,max} return the pointer in the caller's local
110 : // address space to the pair in the bplus tree with the {min,max}
111 : // key. Assumes join is a current local join and bplus tree is not
112 : // empty. The lifetime of the returned pointer is the lesser of
113 : // the lifetime of the local join or the next insert / upsert /
114 : // remove operation on the bplus tree. The bplus tree retains
115 : // ownership of the returned pair and the caller should not modify
116 : // the pair key field. Fast O(1) worst case.
117 : //
118 : // mybplus_{min,max}_const is a const-correct version.
119 :
120 : mypair_t const * mybplus_min_const( mybplus_t const * join );
121 : mypair_t const * mybplus_max_const( mybplus_t const * join );
122 : mypair_t * mybplus_min ( mybplus_t * join );
123 : mypair_t * mybplus_max ( mybplus_t * join );
124 :
125 : // mybplus_query returns the pointer in the caller's local address
126 : // space to the pair in the bplus tree that matches the key pointed
127 : // to by query or NULL if there is no key matching query in the
128 : // bplus tree. Assumes join is a current local join. The lifetime
129 : // of the returned pointer is the lesser of the lifetime of the
130 : // local join or the next insert / upsert / remove operation on the
131 : // bplus tree. The bplus tree retains ownership of the returned
132 : // pair and the caller should not modify the key field. The bplus
133 : // tree has no interest in query in return. Fast O(lg N) worst
134 : // case.
135 : //
136 : // mybplus_query_const is a const-correct version.
137 :
138 : mypair_t const * mybplus_query_const( mybplus_t const * join, mykey_t const * query );
139 : mypair_t * mybplus_query( mybplus_t * join, mykey_t const * query );
140 :
141 : // Operations
142 :
143 : // mybplus_insert inserts a key into the bplus tree. Assumes join
144 : // is a current local join and key points in the caller's address
145 : // space to the key to insert. The bplus tree has no interest in
146 : // key in return.
147 : //
148 : // On success, returns the location in the caller's address space
149 : // where key was inserted. The lifetime of the returned pointer is
150 : // the lesser of the lifetime of the local join or there is an
151 : // insert / upsert / remove operation on the bplus tree. The
152 : // caller should not modify the pair key field but is free to
153 : // modify all the other values.
154 : //
155 : // On failure, returns NULL. Reasons for failure are the key was
156 : // already in the tree (locations of pairs might have changed),
157 : // there were not enough nodes (locations of pairs did not change)
158 : // or there were not enough leaves available to complete the insert
159 : // (locations of pairs did not change).
160 : //
161 : // mybplus_upsert is nearly equivalent to:
162 : //
163 : // int insert;
164 : // mypair_t * pair = mybplus_query ( join, key ); insert = 0;
165 : // if( !pair ) { pair = mybplus_insert( join, key ); insert = 1; }
166 : // if( pair && _opt_insert ) *_opt_insert = insert;
167 : //
168 : // but potentially faster as it only traverses the bplus tree once.
169 : // The "nearly" qualifier is that, unlike the above snippet, the
170 : // upsert might change the location of keys even if key is already
171 : // in the bplus tree. Fast O(lg N) worst case.
172 :
173 : mypair_t * mybplus_insert( mybplus_t * join, mykey_t const * key );
174 : mypair_t * mybplus_upsert( mybplus_t * join, mykey_t const * key, int * _opt_insert );
175 :
176 : // mybplus_remove_key removes a key from the bplus tree. Assumes
177 : // join is a current local join and key points in the caller's
178 : // address space to the key to remove. Returns 0 on success and -1
179 : // if the key was not found in the tree. The bplus tree has no
180 : // interest in key in return. Fast O(lg N) worst case.
181 :
182 : int mybplus_remove_key( mybplus_t * join, mykey_t const * key );
183 :
184 : // mybplus_remove removes the pair pointed to by pair from the
185 : // bplus tree. Assumes join is a current local join and pair is a
186 : // pointer in the caller's local address space to a pair that is
187 : // currently in the bplus tree. The pair is no longer in the bplus
188 : // tree on return. Fast O(lg N) worst case.
189 :
190 : void mybplus_remove( mybplus_t * join, mypair_t * pair );
191 :
192 : // mybplus_flush removes all pairs from the bplus tree. Assumes
193 : // join is a current local join. There are no pairs in the bplus
194 : // tree on return. Fast O( node_max + leaf_max ) worst case.
195 :
196 : void mybplus_flush( mybplus_t * join );
197 :
198 : // mybplus_verify validates the bplus tree pointed by join.
199 : // Returns 0 on success and -1 on failure (logs details).
200 : // O(node_max+leaf_max) worst case.
201 :
202 : int mybplus_verify( mybplus_t const * join );
203 :
204 : // Iteration
205 :
206 : // mybplus_iter_nul returns an iterator positioned at nul. Fast
207 : // O(1) worst case.
208 : //
209 : // mybplus_iter_min returns an iterator positioned at the min pair
210 : // or nul if the bplus is empty. Fast O(1) worst case.
211 : //
212 : // mybplus_iter_max returns an iterator positioned at the max pair
213 : // or nul if the bplus is empty. Fast O(1) worst case.
214 : //
215 : // mybplus_iter_ge returns an iterator positioned at the first pair
216 : // greater than or equal to query or at nul if all keys are less
217 : // than query. query==NULL is equivalent to "+inf". Fast O(lg N)
218 : // worst case.
219 : //
220 : // mybplus_iter_gt returns an iterator positioned at the first pair
221 : // greater than query or at nul if all keys are less than or equal
222 : // to query. query==NULL is equivalent to "+inf". Fast O(lg N)
223 : // worst case.
224 : //
225 : // mybplus_iter_le returns an iterator positioned at the last pair
226 : // less than or equal to query or at nul if all keys are greater
227 : // than query. query==NULL is equivalent to "-inf". Fast O(lg N)
228 : // worst case.
229 : //
230 : // mybplus_iter_lt returns an iterator positioned at the last pair
231 : // less than to query or at nul if all keys are greater than or
232 : // equal to query. query==NULL is equivalent to "-inf". Fast
233 : // O(lg N) worst case.
234 : //
235 : // mybplus_iter_next returns an iterator positioned at the next
236 : // pair or nul if the iterator is currently positioned at last
237 : // pair. Fast O(1) worst case.
238 : //
239 : // mybplus_iter_prev returns an iterator positioned at the previous
240 : // pair or nul if the iterator is currently positioned at first
241 : // pair. Fast O(1) worst case.
242 : //
243 : // mybplus_iter_eq returns true if iter is positioned at the same
244 : // place fini is positioned. Fast O(1) worst case.
245 : //
246 : // mybplus_iter_pair returns a pair associated with the current
247 : // iteration position. mybplus_iter_pair_const is a const correct
248 : // version. Fast O(1) worst case.
249 : //
250 : // Assumes join is a current local join and query points to a valid
251 : // key in the caller's local address space. Retains no interest in
252 : // query on return.
253 : //
254 : // Example: iterate over all pairs in ascending order:
255 : //
256 : // for( mybplus_iter_t iter = mybplus_iter_min( bplus );
257 : // !mybplus_iter_eq_nul( bplus, iter );
258 : // iter = mybplus_iter_next( bplus, iter ) ) {
259 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
260 : // ... process pair here
261 : // ... do not insert, upsert remove keys from bplus here
262 : // ... do not modify key of pair here
263 : // }
264 : //
265 : // Example: iterate over all pairs in descending order:
266 : //
267 : // for( mybplus_iter_t iter = mybplus_iter_max( bplus );
268 : // !mybplus_iter_eq_nul( bplus, iter );
269 : // iter = mybplus_iter_prev( bplus, iter ) ) {
270 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
271 : // ... process pair here
272 : // ... do not insert, upsert remove keys from bplus here
273 : // ... do not modify key of pair here
274 : // }
275 : //
276 : // Example: iterate over all pairs with keys in [key0,key1) in
277 : // ascending order (assumes key1>=key0):
278 : //
279 : // mybplus_iter_t iter = mybplus_iter_ge( bplus, key0 );
280 : // mybplus_iter_t fini = mybplus_iter_ge( bplus, key1 ); // key1==NULL will iterate over all pairs with keys >= key0
281 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
282 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
283 : //
284 : // ... process pair here
285 : // ... do not insert, upsert or remove keys from bplus here
286 : // ... do not modify key of pair here
287 : //
288 : // iter = mybplus_iter_next( bplus, iter );
289 : // }
290 : //
291 : // Example: iterate over all pairs with keys in [key0,key1] in
292 : // ascending order (assumes key1>=key0):
293 : //
294 : // mybplus_iter_t iter = mybplus_iter_ge( bplus, key0 );
295 : // mybplus_iter_t fini = mybplus_iter_gt( bplus, key1 ); // key1==NULL will iterate over all pairs with keys >= key0
296 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
297 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
298 : //
299 : // ... process pair here
300 : // ... do not insert, upsert or remove keys from bplus here
301 : // ... do not modify key of pair here
302 : //
303 : // iter = mybplus_iter_next( bplus, iter );
304 : // }
305 : //
306 : // Example: iterate over all pairs with keys in (key0,key1) in
307 : // ascending order (assumes key1>=key0):
308 : //
309 : // mybplus_iter_t iter = mybplus_iter_gt( bplus, key0 );
310 : // mybplus_iter_t fini = mybplus_iter_ge( bplus, key1 ); // key1==NULL will iterate over all pairs with keys > key0
311 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
312 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
313 : //
314 : // ... process pair here
315 : // ... do not insert, upsert or remove keys from bplus here
316 : // ... do not modify key of pair here
317 : //
318 : // iter = mybplus_iter_next( bplus, iter );
319 : // }
320 : //
321 : // Example: iterate over all pairs with keys in (key0,key1] in
322 : // ascending order (assumes key1>=key0):
323 : //
324 : // mybplus_iter_t iter = mybplus_iter_gt( bplus, key0 );
325 : // mybplus_iter_t fini = mybplus_iter_gt( bplus, key1 ); // key1==NULL will iterate over all pairs with keys > key0
326 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
327 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
328 : //
329 : // ... process pair here
330 : // ... do not insert, upsert or remove keys from bplus here
331 : // ... do not modify key of pair here
332 : //
333 : // iter = mybplus_iter_next( bplus, iter );
334 : // }
335 : //
336 : // Example: iterate over all pairs with keys in [key0,key1) in
337 : // descending order (assumes key1>=key0):
338 : //
339 : // mybplus_iter_t iter = mybplus_iter_lt( bplus, key1 );
340 : // mybplus_iter_t fini = mybplus_iter_lt( bplus, key0 ); // key0==NULL will iterate over all pairs with keys < key1
341 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
342 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
343 : //
344 : // ... process pair here
345 : // ... do not insert, upsert or remove keys from bplus here
346 : // ... do not modify key of pair here
347 : //
348 : // iter = mybplus_iter_prev( bplus, iter );
349 : // }
350 : //
351 : // Example: iterate over all pairs with keys in [key0,key1] in
352 : // descending order (assumes key1>=key0):
353 : //
354 : // mybplus_iter_t iter = mybplus_iter_le( bplus, key1 );
355 : // mybplus_iter_t fini = mybplus_iter_lt( bplus, key0 ); // key0==NULL will iterate over all pairs with keys <= key1
356 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
357 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
358 : //
359 : // ... process pair here
360 : // ... do not insert, upsert or remove keys from bplus here
361 : // ... do not modify key of pair here
362 : //
363 : // iter = mybplus_iter_prev( bplus, iter );
364 : // }
365 : //
366 : // Example: iterate over all pairs with keys in (key0,key1) in
367 : // descending order (assumes key1>=key0):
368 : //
369 : // mybplus_iter_t iter = mybplus_iter_lt( bplus, key1 );
370 : // mybplus_iter_t fini = mybplus_iter_le( bplus, key0 ); // key0==NULL will iterate over all pairs with keys < key1
371 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
372 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
373 : //
374 : // ... process pair here
375 : // ... do not insert, upsert or remove keys from bplus here
376 : // ... do not modify key of pair here
377 : //
378 : // iter = mybplus_iter_prev( bplus, iter );
379 : // }
380 : //
381 : // Example: iterate over all pairs with keys in (key0,key1] in
382 : // descending order (assumes key1>=key0):
383 : //
384 : // mybplus_iter_t iter = mybplus_iter_le( bplus, key1 );
385 : // mybplus_iter_t fini = mybplus_iter_le( bplus, key0 ); // key0==NULL will iterate over all pairs with keys < key1
386 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
387 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
388 : //
389 : // ... process pair here
390 : // ... do not insert, upsert or remove keys from bplus here
391 : // ... do not modify key of pair here
392 : //
393 : // iter = mybplus_iter_prev( bplus, iter );
394 : // }
395 :
396 : mybplus_iter_t mybplus_iter_nul( mybplus_t const * join );
397 : mybplus_iter_t mybplus_iter_min( mybplus_t const * join );
398 : mybplus_iter_t mybplus_iter_max( mybplus_t const * join );
399 :
400 : mybplus_iter_t mybplus_iter_ge( mybplus_t const * join, mykey_t const * query );
401 : mybplus_iter_t mybplus_iter_gt( mybplus_t const * join, mykey_t const * query );
402 : mybplus_iter_t mybplus_iter_le( mybplus_t const * join, mykey_t const * query );
403 : mybplus_iter_t mybplus_iter_lt( mybplus_t const * join, mykey_t const * query );
404 :
405 : int mybplus_iter_eq ( mybplus_t const * join, mybplus_iter_t i0, mybplus_iter_t i1 );
406 : int mybplus_iter_eq_nul( mybplus_t const * join, mybplus_iter_t iter );
407 :
408 : mybplus_iter_t mybplus_iter_next( mybplus_t const * join, mybplus_iter_t iter );
409 : mybplus_iter_t mybplus_iter_prev( mybplus_t const * join, mybplus_iter_t iter );
410 :
411 : mypair_t const * mybplus_iter_pair_const( mybplus_t const * join, mybplus_iter_t iter );
412 : mypair_t * mybplus_iter_pair ( mybplus_t * join, mybplus_iter_t iter );
413 :
414 : You can do this as often as you like in a compilation unit to get
415 : different types of bplus trees. Variants exist for making header
416 : protoypes only and/or implementations if doing a library with
417 : multiple compilation units. Further, options exist to use different
418 : hashing functions, comparison functions, etc as detailed below. */
419 :
420 : /* BPLUS_NAME gives the API prefix. */
421 :
422 : #ifndef BPLUS_NAME
423 : #error "Define BPLUS_NAME"
424 : #endif
425 :
426 : /* BPLUS_KEY_T gives the key type. Should be a plain-old-data type with
427 : a total order. */
428 :
429 : #ifndef BPLUS_KEY_T
430 : #error "Define BPLUS_KEY_T"
431 : #endif
432 :
433 : /* BPLUS_PAIR_T gives the pair type. Should be a structure of the form:
434 :
435 : typedef struct BPLUS_PAIR {
436 : BPLUS_KEY_T key; // Can be arbitrarily placed in structure, should not be modified by the user
437 : ... arbitrary user fields
438 : } BPLUS_PAIR_T;
439 :
440 : (Or the appropriate field name given BPLUS_PAIR_KEY below.) */
441 :
442 : #ifndef BPLUS_PAIR_T
443 : #error "Define BPLUS_PAIR_T"
444 : #endif
445 :
446 : /* BPLUS_PAIR_KEY gives the name of the key field in the BPLUS_PAIR_KEY.
447 : Defaults to key. */
448 :
449 : #ifndef BPLUS_PAIR_KEY
450 : #define BPLUS_PAIR_KEY key
451 : #endif
452 :
453 : /* BPLUS_KEY_CMP compares the keys pointed to by a and b and returns
454 : {<0,0,>0} if the a is {less than,equal to,greater than}. a and b
455 : will be valid pointers to key . Defaults to memcmp based. */
456 :
457 : #ifndef BPLUS_KEY_CMP
458 : #define BPLUS_KEY_CMP(a,b) memcmp( (a), (b), sizeof(*(a)) )
459 : #endif
460 :
461 : /* BPLUS_TREE_MAX is the maximum number of children a non-leaf node can
462 : have. Must be even, >=4 and <<< ULONG_MAX. Defaults to 128. */
463 :
464 : #ifndef BPLUS_TREE_MAX
465 : #define BPLUS_TREE_MAX 128
466 : #endif
467 :
468 : /* BPLUS_PAIR_MAX is the maximum number of children a leaf node can
469 : have. Must be even, >=4 and <<< ULONG_MAX. Defaults to 128. */
470 :
471 : #ifndef BPLUS_PAIR_MAX
472 : #define BPLUS_PAIR_MAX 128
473 : #endif
474 :
475 : /* BPLUS_ALIGN gives the default alignment of the BPLUS region. Should
476 : be a positive integer power of 2. Defaults to 128. */
477 :
478 : #ifndef BPLUS_ALIGN
479 12 : #define BPLUS_ALIGN 128
480 : #endif
481 :
482 : /* BPLUS_NODE_ALIGN gives the default alignment of an interior node.
483 : Should be a positive integer power of 2 of at most BPLUS_ALIGN.
484 : Defaults to 128. */
485 :
486 : #ifndef BPLUS_NODE_ALIGN
487 589152144 : #define BPLUS_NODE_ALIGN 128
488 : #endif
489 :
490 : /* BPLUS_LEAF_ALIGN gives the default alignment of a leaf node. Should
491 : be a positive integer power of 2 of at most BPLUS_ALIGN. Defaults to
492 : 128. */
493 :
494 : #ifndef BPLUS_LEAF_ALIGN
495 589152144 : #define BPLUS_LEAF_ALIGN 128
496 : #endif
497 :
498 : /* BPLUS_MAGIC is the structure magic number to use to aid in persistent
499 : and or IPC usage. */
500 :
501 : #ifndef BPLUS_MAGIC
502 3 : #define BPLUS_MAGIC (0xfdb91c53a61c0000UL) /* FD BPLUS MAGIC 0000 */
503 : #endif
504 :
505 : /* BPLUS_IMPL_STYLE indicates what this generator should output:
506 : 0 - static implementation
507 : 1 - library header
508 : 2 - library implementation */
509 :
510 : #ifndef BPLUS_IMPL_STYLE
511 : #define BPLUS_IMPL_STLYE 0
512 : #endif
513 :
514 : /**********************************************************************/
515 :
516 21022682838 : #define BPLUS_(name)FD_EXPAND_THEN_CONCAT3(BPLUS_NAME,_,name)
517 :
518 : #if BPLUS_IMPL_STYLE==0
519 : #define BPLUS_STATIC FD_FN_UNUSED static
520 : #else
521 : #define BPLUS_STATIC
522 : #endif
523 :
524 : #if BPLUS_IMPL_STYLE==0 || BPLUS_IMPL_STYLE==1
525 :
526 : /* Header *************************************************************/
527 :
528 : #include "../log/fd_log.h"
529 :
530 : struct BPLUS_(private);
531 : typedef struct BPLUS_(private) BPLUS_(t);
532 :
533 : struct BPLUS_(private_iter);
534 : typedef struct BPLUS_(private_iter) BPLUS_(iter_t);
535 :
536 : /* Internal use only */
537 :
538 : /* A bplus_private_node_t is used for finding leaves that might contain
539 : an element fast. */
540 :
541 : struct __attribute__((aligned(BPLUS_NODE_ALIGN))) BPLUS_(private_node) {
542 :
543 : /* This point is BPLUS_NODE_ALIGN aligned */
544 :
545 : ulong tree_cnt; /* if acquired, in [0,BPLUS_TREE_MAX], else ignored */
546 : ulong tree_off[ BPLUS_TREE_MAX ]; /* if acquired, indexed [0,tree_cnt), else
547 : tree_off[0]==node pool next offset (0 if last node in pool) */
548 : BPLUS_KEY_T pivot [ BPLUS_TREE_MAX-1 ]; /* if acquired, indexed [0,tree_cnt-1), else ignored */
549 :
550 : /* tree i handles keys in [ pivot[i-1], pivot[i] ), pivot[-1] /
551 : pivot[tree_cnt-1] are implied to be the previous / next pivot in an
552 : in-order traversal of the bplus tree node pivots (or -/+inf if
553 : leftmost/rightmost). */
554 : };
555 :
556 : typedef struct BPLUS_(private_node) BPLUS_(private_node_t);
557 :
558 : /* A bplus_private_leaf_t holds up to pair_cnt elements of pairs in the
559 : tree in a sorted order. */
560 :
561 : struct __attribute__((aligned(BPLUS_LEAF_ALIGN))) BPLUS_(private_leaf) {
562 :
563 : /* This point is BPLUS_LEAF_ALIGN aligned */
564 :
565 : ulong pair_cnt; /* if acquired, in [0,BPLUS_PAIR_MAX], else ignored */
566 : ulong prev_off; /* if acquired, prev leaf offset (or 0 if first leaf), else ignored */
567 : ulong next_off; /* if acquired, next leaf offset (or 0 if last leaf),
568 : else leaf pool next offset (0 if last node in pool) */
569 : BPLUS_PAIR_T pair[ BPLUS_PAIR_MAX ]; /* if acquired, indexed [0,pair_cnt), unique keys and in ascending order, else ignored */
570 : };
571 :
572 : typedef struct BPLUS_(private_leaf) BPLUS_(private_leaf_t);
573 :
574 : /* A bplus_private_t is a continguous region of memory that holds a
575 : bplus tree. Important invariants:
576 :
577 : - Empty trees have no root.
578 : - If root is a leaf, it has [1,pair_max] pairs.
579 : - If root is a node, it has [2,tree_max] trees.
580 : - Non-root nodes have [tree_min,tree_max] trees.
581 : - Non-root leaves have [pair_min,pair_max] pairs.
582 : - Children of a node are not a mix of nodes and leaves. */
583 :
584 : struct __attribute__((aligned(BPLUS_ALIGN))) BPLUS_(private) {
585 :
586 : /* This point is aligned BPLUS_ALIGN */
587 :
588 : ulong magic; /* ==BPLUS_MAGIC */
589 : ulong node_max; ulong leaf_max; /* maximum number of node/leaf in the store */
590 : ulong node_lo; ulong leaf_lo; /* offset from the first byte of bplus header to the node/leaf storage */
591 : ulong node_pool_off; ulong leaf_pool_off; /* first node/leaf in node/leaf pool, 0 if no node/leaf in pool */
592 : ulong root_off; /* offset of node/leaf to tree root (or 0 if empty) */
593 : ulong leaf_min_off; /* offset of leaf with minimum pair (or 0 if empty) */
594 : ulong leaf_max_off; /* offset of leaf with maximum pair (or 0 if empty) */
595 :
596 : /* padding to BPLUS_NODE_ALIGN here */
597 : /* node_lo points here, node_max elements, indexed [0,node_max) */
598 : /* padding to BPLUS_LEAF_ALIGN here */
599 : /* leaf_lo points here, leaf_max elements, indexed [0,leaf_max) */
600 : /* padding to BPLUS_ALIGN here */
601 :
602 : };
603 :
604 : typedef struct BPLUS_(private) BPLUS_(private_t);
605 :
606 : struct BPLUS_(private_iter) {
607 : ulong leaf_off; /* offset to current leaf */
608 : ulong pair_idx; /* current pair in current leaf */
609 : };
610 :
611 : FD_PROTOTYPES_BEGIN
612 :
613 : /* bplus_private_{pair,tree}_{min,max} return the corresponding
614 : configuration values for this bplus implementation. */
615 :
616 13161015 : FD_FN_CONST static inline ulong BPLUS_(private_pair_min)( void ) { return (ulong)(BPLUS_PAIR_MAX/2); } /* exact */
617 13161021 : FD_FN_CONST static inline ulong BPLUS_(private_pair_max)( void ) { return (ulong) BPLUS_PAIR_MAX; }
618 :
619 10361790 : FD_FN_CONST static inline ulong BPLUS_(private_tree_min)( void ) { return (ulong)(BPLUS_TREE_MAX/2); } /* exact */
620 10361769 : FD_FN_CONST static inline ulong BPLUS_(private_tree_max)( void ) { return (ulong) BPLUS_TREE_MAX; }
621 :
622 : /* bplus_private_{node,leaf}_max_max return a value for {node,leaf}_max
623 : such that the {node,leaf} storage of the bplus tree will require at
624 : most 2^62 bytes. */
625 :
626 : FD_FN_CONST static inline ulong
627 1873272 : BPLUS_(private_node_max_max)( void ) {
628 1873272 : return ((1UL<<62)-BPLUS_NODE_ALIGN+1UL) / sizeof( BPLUS_(private_node_t));
629 1873272 : }
630 :
631 : FD_FN_CONST static inline ulong
632 1873272 : BPLUS_(private_leaf_max_max)( void ) {
633 1873272 : return ((1UL<<62)-BPLUS_LEAF_ALIGN+1UL) / sizeof( BPLUS_(private_leaf_t));
634 1873272 : }
635 :
636 : /* bplus_private_key_cmp gives BPLUS_KEY_CMP the exact function
637 : signature used by the below implementations. */
638 :
639 : FD_FN_PURE static inline int
640 : BPLUS_(private_key_cmp)( BPLUS_KEY_T const * a,
641 9903640383 : BPLUS_KEY_T const * b ) {
642 9903640383 : return BPLUS_KEY_CMP(a,b);
643 9903640383 : }
644 :
645 : /* bplus_private_is_leaf returns 1 if the root of the tree at bplus
646 : global offset is a leaf or 0 if it is a node. leaf_lo is the bplus
647 : global offset of the leaf preallocated storage. Assumes tree_off and
648 : leaf_lo are valid. */
649 :
650 4652879142 : FD_FN_CONST static inline int BPLUS_(private_is_leaf)( ulong tree_off, ulong leaf_lo ) { return tree_off>=leaf_lo; }
651 :
652 : /* bplus_private returns location of the bplus private metadata in the
653 : caller's address space given a valid local join. Lifetime of the
654 : returned pointer is the lifetime of the join. bplus_private_const is
655 : a const correct version. */
656 :
657 : FD_FN_CONST static inline BPLUS_(private_t) *
658 16917414 : BPLUS_(private)( BPLUS_(t) * join ) {
659 16917414 : return (BPLUS_(private_t) *)join;
660 16917414 : }
661 :
662 : FD_FN_CONST static inline BPLUS_(private_t) const *
663 89992482 : BPLUS_(private_const)( BPLUS_(t) const * join ) {
664 89992482 : return (BPLUS_(private_t) const *)join;
665 89992482 : }
666 :
667 : /* bplus_private_{node,leaf} return the pointer in the caller's local
668 : address space of the {node,leaf} located at bplus global
669 : {node,leaf}_off. The lifetime of the returned pointer is the
670 : lifetime of the local join. Assumes bplus and node_off are valid. */
671 :
672 : FD_FN_CONST static inline BPLUS_(private_node_t) *
673 : BPLUS_(private_node)( BPLUS_(private_t) * bplus,
674 71810601 : ulong node_off ) {
675 71810601 : return (BPLUS_(private_node_t) *)((ulong)bplus + node_off);
676 71810601 : }
677 :
678 : FD_FN_CONST static inline BPLUS_(private_leaf_t) *
679 : BPLUS_(private_leaf)( BPLUS_(private_t) * bplus,
680 23130888 : ulong leaf_off ) {
681 23130888 : return (BPLUS_(private_leaf_t) *)((ulong)bplus + leaf_off);
682 23130888 : }
683 :
684 : FD_FN_CONST static inline BPLUS_(private_node_t) const *
685 : BPLUS_(private_node_const)( BPLUS_(private_t) const * bplus,
686 4274319606 : ulong node_off ) {
687 4274319606 : return (BPLUS_(private_node_t) const *)((ulong)bplus + node_off);
688 4274319606 : }
689 :
690 : FD_FN_CONST static inline BPLUS_(private_leaf_t) const *
691 : BPLUS_(private_leaf_const)( BPLUS_(private_t) const * bplus,
692 7092034104 : ulong leaf_off ) {
693 7092034104 : return (BPLUS_(private_leaf_t) const *)((ulong)bplus + leaf_off);
694 7092034104 : }
695 :
696 : /* bplus_private_off returns the bplus global offset for the given
697 : address in the caller's address space. Assumes bplus is valid and
698 : addr is non-NULL and into the bplus memory region. */
699 :
700 : FD_FN_CONST static inline ulong
701 : BPLUS_(private_off)( BPLUS_(private_t) const * bplus,
702 4347300 : void const * addr ) {
703 4347300 : return (ulong)addr - (ulong)bplus;
704 4347300 : }
705 :
706 : /* bplus_private_node_acquire acquires a node from the bplus's node pool
707 : and returns a pointer to it in the caller's address space. Assumes
708 : bplus is valid. Returns NULL if bplus node pool is empty.
709 :
710 : bplus_private_node_release releases a node to the bplus's node pool.
711 : Assumes bplus is valid, node is valid and node is not currently in
712 : the pool.
713 :
714 : Similarly for bplus_private_leaf_{acquire,release}. */
715 :
716 : static inline BPLUS_(private_node_t) *
717 232035 : BPLUS_(private_node_acquire)( BPLUS_(private_t) * bplus ) {
718 232035 : ulong node_off = bplus->node_pool_off;
719 232035 : if( FD_UNLIKELY( !node_off ) ) return NULL;
720 232035 : BPLUS_(private_node_t *) node = BPLUS_(private_node)( bplus, node_off );
721 232035 : bplus->node_pool_off = node->tree_off[0];
722 232035 : return node;
723 232035 : }
724 :
725 : static inline void
726 : BPLUS_(private_node_release)( BPLUS_(private_t) * bplus,
727 236961 : BPLUS_(private_node_t) * node ) {
728 236961 : node->tree_off[0] = bplus->node_pool_off;
729 236961 : bplus->node_pool_off = BPLUS_(private_off)( bplus, node );
730 236961 : }
731 :
732 : static inline BPLUS_(private_leaf_t) *
733 967224 : BPLUS_(private_leaf_acquire)( BPLUS_(private_t) * bplus ) {
734 967224 : ulong leaf_off = bplus->leaf_pool_off;
735 967224 : if( FD_UNLIKELY( !leaf_off ) ) return NULL;
736 967221 : BPLUS_(private_leaf_t *) leaf = BPLUS_(private_leaf)( bplus, leaf_off );
737 967221 : bplus->leaf_pool_off = leaf->next_off;
738 967221 : return leaf;
739 967224 : }
740 :
741 : static inline void
742 : BPLUS_(private_leaf_release)( BPLUS_(private_t) * bplus,
743 976146 : BPLUS_(private_leaf_t) * leaf ) {
744 976146 : leaf->next_off = bplus->leaf_pool_off;
745 976146 : bplus->leaf_pool_off = BPLUS_(private_off)( bplus, leaf );
746 976146 : }
747 :
748 : /* bplus_private_insert inserts or upserts a key into a bplus tree.
749 : Assumes join is a current local join and key points to a valid key in
750 : the caller's address space and upsert is in [0,1].
751 :
752 : upsert 0: key will inserted into bplus. On success, returns the pair
753 : where key was inserted and, on return, *_insert will be 1. Caller
754 : can update all fields in the pair except the key. Lifetime of the
755 : returned pointer is until the next insert / upsert / remove. Returns
756 : NULL if there was no room in the bplus tree or if key was already in
757 : the bplus tree (might have moved pairs around in bplus tree on
758 : failure) and _insert will be untouched.
759 :
760 : upsert 1: key will inserted or updated into bplus. If key is already
761 : present in the bplus tree, returns the location in the caller's
762 : address space of the pair with the matching key and, on return,
763 : *_insert will be 0. If not, inserts the key and requires the
764 : location in the caller's address space where pair was inserted and,
765 : on return, *_insert will be 1. In both cases, the lifetime of the
766 : returned pointer is until the next insert / upsert / remove. Returns
767 : NULL if there was no room in the bplus tree to insert (might have
768 : moved pairs around in bplus tree on failure) and _insert will be
769 : untouched.
770 :
771 : The bplus retains no interest in query on return. */
772 :
773 : BPLUS_STATIC BPLUS_PAIR_T *
774 : BPLUS_(private_insert)( BPLUS_(t) * join,
775 : BPLUS_KEY_T const * key,
776 : int upsert,
777 : int * _insert );
778 :
779 : /* bplus_private_iter returns the iterator corresponding to query and
780 : op. Assumes join is a current local join, query points to a valid
781 : key in the caller's address space or is NULL and op is in [0,3].
782 : Returns an iter positioned at:
783 :
784 : op | position
785 : -------+-------------------------------------------------------------------------------------------------------------------
786 : 0 (GE) | the first pair with a key greater than or equal to query (or nul if all have keys less than query)
787 : 1 (GT) | the first pair with a key greater than query (or nul if all have keys less than or equal to query)
788 : 2 (LE) | the last pair with a key less than or equal to query (or nul if all have keys greater than query)
789 : 3 (LT) | the last pair with a key less than query (or nul if all have keys greater than or equal to query)
790 :
791 : If query is NULL, iteration will be positioned as though:
792 :
793 : op | query
794 : -------+-------
795 : 0 (GE) | +inf
796 : 1 (GT) | +inf
797 : 2 (LE) | -inf
798 : 3 (LT) | -inf
799 :
800 : The bplus retains no interest in query on return. */
801 :
802 : FD_FN_PURE BPLUS_STATIC BPLUS_(iter_t)
803 : BPLUS_(private_iter)( BPLUS_(t) const * join,
804 : BPLUS_KEY_T const * query,
805 : int op );
806 :
807 : FD_PROTOTYPES_END
808 :
809 : /* End internal use only */
810 :
811 : FD_PROTOTYPES_BEGIN
812 :
813 : /* Constructors */
814 :
815 : FD_FN_CONST BPLUS_STATIC ulong BPLUS_(leaf_max_est)( ulong ele_max_est );
816 : FD_FN_CONST BPLUS_STATIC ulong BPLUS_(node_max_est)( ulong ele_max_est );
817 :
818 : FD_FN_CONST BPLUS_STATIC ulong BPLUS_(align) ( void );
819 : FD_FN_CONST BPLUS_STATIC ulong BPLUS_(footprint)( ulong node_max, ulong leaf_max );
820 :
821 : BPLUS_STATIC void * BPLUS_(new) ( void * shmem, ulong node_max, ulong leaf_max );
822 : BPLUS_STATIC BPLUS_(t) * BPLUS_(join) ( void * shbplus );
823 : BPLUS_STATIC void * BPLUS_(leave) ( BPLUS_(t) * join );
824 : BPLUS_STATIC void * BPLUS_(delete)( void * shbplus );
825 :
826 : /* Accessors */
827 :
828 1873254 : FD_FN_PURE static inline ulong BPLUS_(node_max)( BPLUS_(t) const * join ) { return BPLUS_(private_const)( join )->node_max; }
829 1873254 : FD_FN_PURE static inline ulong BPLUS_(leaf_max)( BPLUS_(t) const * join ) { return BPLUS_(private_const)( join )->leaf_max; }
830 :
831 16869642 : FD_FN_PURE static inline int BPLUS_(is_empty)( BPLUS_(t) const * join ) { return !BPLUS_(private_const)( join )->root_off; }
832 :
833 : FD_FN_PURE static inline BPLUS_PAIR_T const *
834 7497702 : BPLUS_(min_const)( BPLUS_(t) const * join ) {
835 7497702 : BPLUS_(t) const * bplus = BPLUS_(private_const)( join );
836 7497702 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, bplus->leaf_min_off );
837 7497702 : return &leaf->pair[0];
838 7497702 : }
839 :
840 : FD_FN_PURE static inline BPLUS_PAIR_T const *
841 7497702 : BPLUS_(max_const)( BPLUS_(t) const * join ) {
842 7497702 : BPLUS_(t) const * bplus = BPLUS_(private_const)( join );
843 7497702 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, bplus->leaf_max_off );
844 7497702 : return &leaf->pair[ leaf->pair_cnt-1UL ];
845 7497702 : }
846 :
847 3748851 : FD_FN_PURE static inline BPLUS_PAIR_T * BPLUS_(min)( BPLUS_(t) * join ) { return (BPLUS_PAIR_T *)BPLUS_(min_const)( join ); }
848 3748851 : FD_FN_PURE static inline BPLUS_PAIR_T * BPLUS_(max)( BPLUS_(t) * join ) { return (BPLUS_PAIR_T *)BPLUS_(max_const)( join ); }
849 :
850 : FD_FN_PURE BPLUS_STATIC BPLUS_PAIR_T const * BPLUS_(query_const)( BPLUS_(t) const * join, BPLUS_KEY_T const * query );
851 :
852 : FD_FN_PURE static inline BPLUS_PAIR_T *
853 : BPLUS_(query)( BPLUS_(t) * join,
854 13124622 : BPLUS_KEY_T const * query ) {
855 13124622 : return (BPLUS_PAIR_T *)BPLUS_(query_const)( join, query );
856 13124622 : }
857 :
858 : /* Operations */
859 :
860 : static inline BPLUS_PAIR_T *
861 : BPLUS_(insert)( BPLUS_(t) * join,
862 3773271 : BPLUS_KEY_T const * key ) {
863 3773271 : int dummy;
864 3773271 : return BPLUS_(private_insert)( join, key, 0, &dummy );
865 3773271 : }
866 :
867 : static inline BPLUS_PAIR_T *
868 : BPLUS_(upsert)( BPLUS_(t) * join,
869 : BPLUS_KEY_T const * key,
870 3751845 : int * _opt_insert ) {
871 3751845 : int dummy;
872 3751845 : if( !_opt_insert ) _opt_insert = &dummy; /* compile time */
873 3751845 : return BPLUS_(private_insert)( join, key, 1, _opt_insert );
874 3751845 : }
875 :
876 : BPLUS_STATIC int BPLUS_(remove_key)( BPLUS_(t) * join, BPLUS_KEY_T const * key );
877 :
878 1873158 : static inline void BPLUS_(remove)( BPLUS_(t) * join, BPLUS_PAIR_T * pair ) { BPLUS_(remove_key)( join, &pair->BPLUS_PAIR_KEY ); }
879 :
880 : BPLUS_STATIC void BPLUS_(flush)( BPLUS_(t) * join );
881 :
882 : BPLUS_STATIC int BPLUS_(verify)( BPLUS_(t) const * join );
883 :
884 : /* Iteration */
885 : /* FIXME: FD_FN_CONST for nul/eq/eq_nul/pair/pair_const? FD_FN_PURE for
886 : min/max/prev/next? */
887 :
888 : static inline BPLUS_(iter_t)
889 1875756 : BPLUS_(iter_nul)( BPLUS_(t) const * join ) {
890 1875756 : (void)join;
891 1875756 : BPLUS_(iter_t) iter;
892 1875756 : iter.leaf_off = 0UL;
893 1875756 : iter.pair_idx = 0UL;
894 1875756 : return iter;
895 1875756 : }
896 :
897 : static inline BPLUS_(iter_t)
898 1875756 : BPLUS_(iter_min)( BPLUS_(t) const * join ) {
899 1875756 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
900 1875756 : ulong leaf_off = bplus->leaf_min_off;
901 1875756 : BPLUS_(iter_t) iter;
902 1875756 : iter.leaf_off = leaf_off;
903 1875756 : iter.pair_idx = 0UL;
904 1875756 : return iter;
905 1875756 : }
906 :
907 : static inline BPLUS_(iter_t)
908 1875756 : BPLUS_(iter_max)( BPLUS_(t) const * join ) {
909 1875756 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
910 1875756 : ulong leaf_off = bplus->leaf_max_off;
911 1875756 : BPLUS_(iter_t) iter;
912 1875756 : iter.leaf_off = leaf_off;
913 1875756 : iter.pair_idx = (FD_UNLIKELY( !leaf_off ) ? 1UL : BPLUS_(private_leaf_const)( bplus, leaf_off )->pair_cnt) - 1UL;
914 1875756 : return iter;
915 1875756 : }
916 :
917 : FD_FN_PURE static inline BPLUS_(iter_t)
918 : BPLUS_(iter_ge)( BPLUS_(t) const * join,
919 3749115 : BPLUS_KEY_T const * query ) {
920 3749115 : return BPLUS_(private_iter)( join, query, 0 );
921 3749115 : }
922 :
923 : FD_FN_PURE static inline BPLUS_(iter_t)
924 : BPLUS_(iter_gt)( BPLUS_(t) const * join,
925 3749115 : BPLUS_KEY_T const * query ) {
926 3749115 : return BPLUS_(private_iter)( join, query, 1 );
927 3749115 : }
928 :
929 : FD_FN_PURE static inline BPLUS_(iter_t)
930 : BPLUS_(iter_le)( BPLUS_(t) const * join,
931 3749115 : BPLUS_KEY_T const * query ) {
932 3749115 : return BPLUS_(private_iter)( join, query, 2 );
933 3749115 : }
934 :
935 : FD_FN_PURE static inline BPLUS_(iter_t)
936 : BPLUS_(iter_lt)( BPLUS_(t) const * join,
937 3749115 : BPLUS_KEY_T const * query ) {
938 3749115 : return BPLUS_(private_iter)( join, query, 3 );
939 3749115 : }
940 :
941 : static inline int
942 : BPLUS_(iter_eq)( BPLUS_(t) const * join,
943 : BPLUS_(iter_t) iter,
944 11251554 : BPLUS_(iter_t) fini ) {
945 11251554 : (void)join;
946 11251554 : return (iter.leaf_off==fini.leaf_off) & (iter.pair_idx==fini.pair_idx);
947 11251554 : }
948 :
949 : static inline int
950 : BPLUS_(iter_eq_nul)( BPLUS_(t) const * join,
951 14998770 : BPLUS_(iter_t) iter ) {
952 14998770 : (void)join;
953 14998770 : return !iter.leaf_off;
954 14998770 : }
955 :
956 : __attribute__((warn_unused_result))
957 : static inline BPLUS_(iter_t)
958 : BPLUS_(iter_next)( BPLUS_(t) const * join,
959 3751176 : BPLUS_(iter_t) iter ) {
960 3751176 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
961 :
962 3751176 : ulong leaf_off = iter.leaf_off;
963 3751176 : ulong pair_idx = iter.pair_idx;
964 :
965 3751176 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, leaf_off );
966 :
967 3751176 : pair_idx++;
968 3751176 : if( FD_UNLIKELY( pair_idx>=leaf->pair_cnt ) ) { /* optimize for high radix */
969 3751176 : leaf_off = leaf->next_off;
970 3751176 : pair_idx = 0UL;
971 3751176 : }
972 :
973 3751176 : iter.leaf_off = leaf_off;
974 3751176 : iter.pair_idx = pair_idx;
975 3751176 : return iter;
976 3751176 : }
977 :
978 : __attribute__((warn_unused_result))
979 : static inline BPLUS_(iter_t)
980 : BPLUS_(iter_prev)( BPLUS_(t) const * join,
981 1875681 : BPLUS_(iter_t) iter ) {
982 1875681 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
983 :
984 1875681 : ulong leaf_off = iter.leaf_off;
985 1875681 : ulong pair_idx = iter.pair_idx;
986 :
987 1875681 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, leaf_off );
988 :
989 1875681 : if( FD_UNLIKELY( !pair_idx ) ) { /* optimize for high radix */
990 1875681 : leaf_off = leaf->prev_off;
991 1875681 : pair_idx = FD_UNLIKELY( !leaf_off ) ? 1UL : BPLUS_(private_leaf_const)( bplus, leaf_off )->pair_cnt;
992 1875681 : }
993 1875681 : pair_idx--;
994 :
995 1875681 : iter.leaf_off = leaf_off;
996 1875681 : iter.pair_idx = pair_idx;
997 1875681 : return iter;
998 1875681 : }
999 :
1000 : static inline BPLUS_PAIR_T const *
1001 : BPLUS_(iter_pair_const)( BPLUS_(t) const * join,
1002 11243061 : BPLUS_(iter_t) iter ) {
1003 11243061 : return BPLUS_(private_leaf_const)( BPLUS_(private_const)( join ), iter.leaf_off )->pair + iter.pair_idx;
1004 11243061 : }
1005 :
1006 : static inline BPLUS_PAIR_T *
1007 : BPLUS_(iter_pair)( BPLUS_(t) * join,
1008 3751362 : BPLUS_(iter_t) iter ) {
1009 3751362 : return BPLUS_(private_leaf)( BPLUS_(private)( join ), iter.leaf_off )->pair + iter.pair_idx;
1010 3751362 : }
1011 :
1012 : FD_PROTOTYPES_END
1013 :
1014 : #endif
1015 :
1016 : #if BPLUS_IMPL_STYLE==0 || BPLUS_IMPL_STYLE==2
1017 :
1018 : /* Implementation *****************************************************/
1019 :
1020 : /* bplus_private_node_query returns the index of a node's child tree,
1021 : in [0,tree_cnt), that might contain query.
1022 :
1023 : tree 0 covers keys [ -inf, pivot[0] )
1024 : i covers keys [ pivot[i-1], pivot[i] )
1025 : tree_cnt-1 covers keys [ pivot[tree_cnt-2], +inf )
1026 :
1027 : Assumes pivot contains unique keys in ascending order, tree_cnt is in
1028 : [2,tree_max], tree_max <<< ULONG_MAX and query is valid. */
1029 :
1030 : FD_FN_PURE static ulong
1031 : BPLUS_(private_node_query)( BPLUS_KEY_T const * FD_RESTRICT pivot,
1032 : ulong tree_cnt,
1033 242154210 : BPLUS_KEY_T const * FD_RESTRICT query ) {
1034 242154210 : ulong i0 = 0UL;
1035 242154210 : ulong i1 = tree_cnt;
1036 :
1037 452318847 : do {
1038 :
1039 : /* At this point, query might be found in trees in [i0,i1) and this
1040 : range contains at least two trees. Test the middle tree. If it
1041 : matches exactly, we are done. Otherwise, recurse on the
1042 : appropriate half of the range. */
1043 :
1044 452318847 : ulong im = (i0+i1) >> 1; /* No overflow, at least 1 */
1045 :
1046 452318847 : int cmp = BPLUS_(private_key_cmp)( query, &pivot[im-1UL] );
1047 452318847 : if( FD_UNLIKELY( !cmp ) ) return im; /* (optional) early abort, optimize for big trees */
1048 447066750 : i0 = fd_ulong_if( cmp<0, i0, im );
1049 447066750 : i1 = fd_ulong_if( cmp<0, im, i1 );
1050 :
1051 447066750 : } while( FD_LIKELY( (i1-i0)>1UL) ); /* optimize for big trees */
1052 :
1053 236902113 : return i0;
1054 242154210 : }
1055 :
1056 : /* bplus_private_pair_query returns the index of a leaf's pair, in
1057 : [0,pair_cnt), that exactly matches query or pair if there is no
1058 : matching pair. Assumes pair keys are unique and ascending sorted,
1059 : pair_cnt is in [1,pair_max], pair_max <<< ULONG_MAX and query is
1060 : valid. */
1061 :
1062 : FD_FN_PURE static ulong
1063 : BPLUS_(private_pair_query)( BPLUS_PAIR_T const * FD_RESTRICT pair,
1064 : ulong pair_cnt,
1065 22530249 : BPLUS_KEY_T const * FD_RESTRICT query ) {
1066 22530249 : ulong i0 = 0UL;
1067 22530249 : ulong i1 = pair_cnt;
1068 :
1069 36757224 : do {
1070 :
1071 : /* At this point, query might match one of the pairs in [i0,i1) and
1072 : this range is not empty. Test the pair in the middle. If it
1073 : matches, we found the pair. Otherwise, recurse appropriate half
1074 : of the range (exclusive of our query). */
1075 :
1076 36757224 : ulong im = (i0+i1) >> 1; /* No overflow */
1077 :
1078 36757224 : int cmp = BPLUS_(private_key_cmp)( query, &pair[im].BPLUS_PAIR_KEY );
1079 36757224 : if( FD_UNLIKELY( !cmp ) ) return im; /* Found, optimize for big trees */
1080 23600799 : i0 = fd_ulong_if( cmp<0, i0, im+1UL );
1081 23600799 : i1 = fd_ulong_if( cmp<0, im, i1 );
1082 :
1083 23600799 : } while( FD_LIKELY( i1-i0 ) ); /* optimize for big trees */
1084 :
1085 9373824 : return pair_cnt; /* not found */
1086 22530249 : }
1087 :
1088 : /* bplus_private_child_insert inserts a child at position child_idx into
1089 : parent. Parent should have a tree_cnt in [1,tree_max-1] and
1090 : child_idx should be in [1,tree_cnt] (such that the child is never
1091 : inserted into a parent with no children or a parent with the maximum
1092 : number of children and is never inserted as the first born child).
1093 : child_off is the bplus global offset of the child. This can be a
1094 : node or leaf but it should match parent's current children.
1095 : child_pivot is the pivot value associated with the child and the
1096 : child_idx should preserve the parent's pivot sorting. Further, child
1097 : should not contain any keys that outside the parent's pivot range
1098 : after the insert. */
1099 :
1100 : static void
1101 : BPLUS_(private_child_insert)( BPLUS_(private_node_t) * FD_RESTRICT parent,
1102 : ulong child_idx,
1103 : ulong child_off,
1104 1198194 : BPLUS_KEY_T const * FD_RESTRICT child_pivot ) {
1105 1198194 : ulong tree_cnt = parent->tree_cnt;
1106 1198194 : ulong * FD_RESTRICT tree_off = parent->tree_off;
1107 1198194 : BPLUS_KEY_T * FD_RESTRICT pivot = parent->pivot;
1108 :
1109 : /* Make room for child at child_idx by shifting childen currently at
1110 : or after child_idx up one. */
1111 :
1112 2885352 : for( ulong sibling_idx=tree_cnt; sibling_idx>child_idx; sibling_idx-- ) {
1113 1687158 : tree_off[sibling_idx ] = tree_off[sibling_idx-1UL];
1114 1687158 : pivot [sibling_idx-1UL] = pivot [sibling_idx-2UL];
1115 1687158 : }
1116 :
1117 : /* Insert the child at child_idx */
1118 :
1119 1198194 : tree_off[child_idx ] = child_off;
1120 1198194 : pivot [child_idx-1UL] = child_pivot[0];
1121 :
1122 1198194 : parent->tree_cnt = tree_cnt + 1UL; /* In [2,tree_max] */
1123 1198194 : }
1124 :
1125 : /* bplus_private_child_remove removes the child child_idx from the bplus
1126 : node parent. Assumes parent is valid with a tree cnt in [2,tree_max]
1127 : and that child is in [1,tree_cnt) (as such, this will never remove
1128 : the first born child). */
1129 :
1130 : static void
1131 : BPLUS_(private_child_remove)( BPLUS_(private_node_t) * FD_RESTRICT parent,
1132 1193622 : ulong child_idx ) {
1133 1193622 : ulong tree_cnt = parent->tree_cnt;
1134 1193622 : ulong * FD_RESTRICT tree_off = parent->tree_off;
1135 1193622 : BPLUS_KEY_T * FD_RESTRICT pivot = parent->pivot;
1136 :
1137 : /* Fill the hole at child_idx by shifting childen currently at or
1138 : after child_idx down one. */
1139 :
1140 1193622 : tree_cnt--;
1141 2689209 : for( ulong sibling_idx=child_idx; sibling_idx<tree_cnt; sibling_idx++ ) {
1142 1495587 : tree_off[sibling_idx ] = tree_off[sibling_idx+1UL];
1143 1495587 : pivot [sibling_idx-1UL] = pivot [sibling_idx ];
1144 1495587 : }
1145 :
1146 1193622 : parent->tree_cnt = tree_cnt; /* In [1,tree_max-1] */
1147 1193622 : }
1148 :
1149 : ulong
1150 18 : BPLUS_(leaf_max_est)( ulong ele_max_est ) {
1151 :
1152 : /* No leaves needed for always empty trees */
1153 :
1154 18 : if( FD_UNLIKELY( !ele_max_est ) ) return 0UL;
1155 :
1156 : /* Trivial bplus trees have just a root leaf */
1157 :
1158 12 : if( FD_UNLIKELY( ele_max_est<=BPLUS_(private_pair_max)() ) ) return 1UL;
1159 :
1160 : /* In a non-trivial bplus tree, each leaf has at least
1161 : pair_min==pair_max/2 elements. So, we require:
1162 :
1163 : leaf_max*pair_min >= ele_max_est
1164 : -> leaf_max >= ele_max_est / pair_min
1165 :
1166 : The smallest leaf_max that satisfies this is:
1167 :
1168 : ceil( ele_max_est / pair_min )
1169 : -> floor( (ele_max_est + pair_min - 1) / pair_min )
1170 : -> 1 + floor( (ele_max_est - 1) / pair_min */
1171 :
1172 6 : return 1UL + ((ele_max_est-1UL) / BPLUS_(private_pair_min)()); /* No overflow */
1173 12 : }
1174 :
1175 : ulong
1176 9 : BPLUS_(node_max_est)( ulong ele_max_est ) {
1177 :
1178 : /* Start at the leaf layer with leaf_max trees */
1179 :
1180 9 : ulong node_max = 0UL;
1181 9 : ulong tree_cnt = BPLUS_(leaf_max_est)( ele_max_est );
1182 :
1183 30 : while( tree_cnt>1UL ) {
1184 :
1185 : /* At this point, we have more than one tree in the current layer.
1186 : To reduce the number of trees, we create a new layer of nodes
1187 : above it and make each new node responsible for up to
1188 : tree_min==tree_max/2 of the trees in the current layer to give a
1189 : reasonably tight bound to the worst case. That implies this new
1190 : layer will need at most:
1191 :
1192 : ceil( tree_cnt / tree_min )
1193 : -> floor( (tree_cnt + tree_min - 1) / tree_min )
1194 : -> 1 + floor( (tree_cnt - 1) / tree_min )
1195 :
1196 : nodes and this layer will reduce to the number of trees to the
1197 : same number. */
1198 :
1199 21 : tree_cnt = 1UL + ((tree_cnt-1UL) / BPLUS_(private_tree_min)()); /* No overflow */
1200 21 : node_max += tree_cnt;
1201 :
1202 21 : }
1203 :
1204 9 : return node_max;
1205 9 : }
1206 :
1207 : ulong
1208 3 : BPLUS_(align)( void ) {
1209 3 : return BPLUS_ALIGN;
1210 3 : }
1211 :
1212 : ulong
1213 : BPLUS_(footprint)( ulong node_max,
1214 18 : ulong leaf_max ) {
1215 :
1216 18 : if( FD_UNLIKELY( (node_max > BPLUS_(private_node_max_max)()) | (leaf_max > BPLUS_(private_leaf_max_max)()) ) ) return 0UL;
1217 :
1218 : /* At this point, the needed node and leaf storage is at most 2^63,
1219 : which is impractically large but also with plenty of room left over
1220 : for the metadata and remaining alignment padding. */
1221 :
1222 6 : ulong off = 0UL; /**/ off += sizeof( BPLUS_(private_t) );
1223 6 : off = fd_ulong_align_up( off, BPLUS_NODE_ALIGN ); /*ulong node_lo = off;*/ off += node_max*sizeof( BPLUS_(private_node_t) );
1224 6 : off = fd_ulong_align_up( off, BPLUS_LEAF_ALIGN ); /*ulong leaf_lo = off;*/ off += leaf_max*sizeof( BPLUS_(private_leaf_t) );
1225 6 : off = fd_ulong_align_up( off, BPLUS_ALIGN );
1226 :
1227 6 : return off;
1228 18 : }
1229 :
1230 : void
1231 6 : BPLUS_(flush)( BPLUS_(t) * bplus ) {
1232 6 : bplus->node_pool_off = 0UL;
1233 6 : bplus->leaf_pool_off = 0UL;
1234 6 : bplus->root_off = 0UL;
1235 6 : bplus->leaf_min_off = 0UL;
1236 6 : bplus->leaf_max_off = 0UL;
1237 :
1238 6 : BPLUS_(private_node_t) * node = BPLUS_(private_node)( bplus, bplus->node_lo );
1239 6162 : for( ulong node_rem=bplus->node_max; node_rem; node_rem-- ) BPLUS_(private_node_release)( bplus, &node[ node_rem-1UL ] );
1240 :
1241 6 : BPLUS_(private_leaf_t) * leaf = BPLUS_(private_leaf)( bplus, bplus->leaf_lo );
1242 12294 : for( ulong leaf_rem=bplus->leaf_max; leaf_rem; leaf_rem-- ) BPLUS_(private_leaf_release)( bplus, &leaf[ leaf_rem-1UL ] );
1243 6 : }
1244 :
1245 : void *
1246 : BPLUS_(new)( void * shmem,
1247 : ulong node_max,
1248 15 : ulong leaf_max ) {
1249 15 : BPLUS_(private_t) * bplus = (BPLUS_(private_t) *)shmem;
1250 :
1251 15 : if( FD_UNLIKELY( !bplus ) ) {
1252 3 : FD_LOG_WARNING(( "NULL shmem" ));
1253 3 : return NULL;
1254 3 : }
1255 :
1256 12 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)bplus, BPLUS_ALIGN ) ) ) {
1257 3 : FD_LOG_WARNING(( "misaligned shmem" ));
1258 3 : return NULL;
1259 3 : }
1260 :
1261 9 : ulong footprint = BPLUS_(footprint)( node_max, leaf_max );
1262 9 : if( FD_UNLIKELY( !footprint ) ) {
1263 6 : FD_LOG_WARNING(( "bad node_max and/or leaf_max" ));
1264 6 : return NULL;
1265 6 : }
1266 :
1267 : /* Note: it is the caller's responsibility to clear the memory because
1268 : it is potentially very big and very time consuming to do so and may
1269 : already have been cleared (e.g. mmap from the OS) */
1270 :
1271 3 : ulong off;
1272 3 : off = 0UL; /**/ off += sizeof( BPLUS_(private_t) );
1273 3 : off = fd_ulong_align_up( off, BPLUS_NODE_ALIGN ); ulong node_lo = off; off += node_max*sizeof( BPLUS_(private_node_t) );
1274 3 : off = fd_ulong_align_up( off, BPLUS_LEAF_ALIGN ); ulong leaf_lo = off; off += leaf_max*sizeof( BPLUS_(private_leaf_t) );
1275 3 : off = fd_ulong_align_up( off, BPLUS_ALIGN );
1276 :
1277 3 : bplus->node_max = node_max; bplus->leaf_max = leaf_max;
1278 3 : bplus->node_lo = node_lo; bplus->leaf_lo = leaf_lo;
1279 :
1280 3 : BPLUS_(flush)( bplus );
1281 :
1282 3 : FD_COMPILER_MFENCE();
1283 3 : bplus->magic = BPLUS_MAGIC;
1284 3 : FD_COMPILER_MFENCE();
1285 :
1286 3 : return shmem;
1287 9 : }
1288 :
1289 : BPLUS_(t) *
1290 12 : BPLUS_(join)( void * shbplus ) {
1291 12 : BPLUS_(private_t) * bplus = (BPLUS_(private_t) *)shbplus;
1292 :
1293 12 : if( FD_UNLIKELY( !bplus ) ) {
1294 3 : FD_LOG_WARNING(( "NULL shbplus" ));
1295 3 : return NULL;
1296 3 : }
1297 :
1298 9 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)bplus, BPLUS_ALIGN ) ) ) {
1299 3 : FD_LOG_WARNING(( "misaligned shbplus" ));
1300 3 : return NULL;
1301 3 : }
1302 :
1303 6 : if( FD_UNLIKELY( bplus->magic!=BPLUS_MAGIC ) ) {
1304 3 : FD_LOG_WARNING(( "bad magic" ));
1305 3 : return NULL;
1306 3 : }
1307 :
1308 3 : return (BPLUS_(t) *)bplus;
1309 6 : }
1310 :
1311 : void *
1312 6 : BPLUS_(leave)( BPLUS_(t) * join ) {
1313 6 : if( FD_UNLIKELY( !join ) ) {
1314 3 : FD_LOG_WARNING(( "NULL join" ));
1315 3 : return NULL;
1316 3 : }
1317 :
1318 3 : return (void *)join;
1319 6 : }
1320 :
1321 : void *
1322 12 : BPLUS_(delete)( void * shbplus ) {
1323 12 : BPLUS_(private_t) * bplus = (BPLUS_(private_t) *)shbplus;
1324 :
1325 12 : if( FD_UNLIKELY( !bplus ) ) {
1326 3 : FD_LOG_WARNING(( "NULL shbplus" ));
1327 3 : return NULL;
1328 3 : }
1329 :
1330 9 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)bplus, BPLUS_ALIGN ) ) ) {
1331 3 : FD_LOG_WARNING(( "misaligned shbplus" ));
1332 3 : return NULL;
1333 3 : }
1334 :
1335 6 : if( FD_UNLIKELY( bplus->magic!=BPLUS_MAGIC ) ) {
1336 3 : FD_LOG_WARNING(( "bad magic" ));
1337 3 : return NULL;
1338 3 : }
1339 :
1340 3 : FD_COMPILER_MFENCE();
1341 3 : bplus->magic = 0UL;
1342 3 : FD_COMPILER_MFENCE();
1343 :
1344 3 : return (void *)bplus;
1345 6 : }
1346 :
1347 : BPLUS_PAIR_T const *
1348 : BPLUS_(query_const)( BPLUS_(t) const * join,
1349 16889784 : BPLUS_KEY_T const * query ) {
1350 16889784 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
1351 :
1352 : /* If an empty bplus tree, not found */
1353 :
1354 16889784 : ulong tree_off = bplus->root_off;
1355 16889784 : if( FD_UNLIKELY( !tree_off ) ) return NULL; /* optimize for big trees */
1356 :
1357 : /* At this point, the bplus tree is not empty. Find the leaf that
1358 : might contain query. */
1359 :
1360 16889409 : ulong leaf_lo = bplus->leaf_lo;
1361 107671575 : while( FD_LIKELY( !BPLUS_(private_is_leaf)( tree_off, leaf_lo ) ) ) { /* optimize for big trees */
1362 90782166 : BPLUS_(private_node_t) const * node = BPLUS_(private_node_const)( bplus, tree_off );
1363 90782166 : tree_off = node->tree_off[ BPLUS_(private_node_query)( node->pivot, node->tree_cnt, query ) ];
1364 90782166 : }
1365 16889409 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, tree_off );
1366 :
1367 : /* At this point, leaf might contain query. Query the leaf */
1368 :
1369 16889409 : pair_t const * pair = leaf->pair;
1370 16889409 : ulong pair_cnt = leaf->pair_cnt;
1371 16889409 : ulong pair_idx = BPLUS_(private_pair_query)( pair, pair_cnt, query );
1372 :
1373 16889409 : return fd_ptr_if( pair_idx<pair_cnt, &pair[ pair_idx ], NULL );
1374 16889784 : }
1375 :
1376 : BPLUS_PAIR_T *
1377 : BPLUS_(private_insert)( BPLUS_(t) * join,
1378 : BPLUS_KEY_T const * key,
1379 : int upsert,
1380 7525116 : int * _insert ) {
1381 7525116 : BPLUS_(private_t) * bplus = BPLUS_(private)( join );
1382 :
1383 : /* If the bplus tree is empty, create the root leaf and insert the key
1384 : into it */
1385 :
1386 7525116 : ulong tree_off = bplus->root_off;
1387 7525116 : if( FD_UNLIKELY( !tree_off ) ) { /* Empty bplus, optimize for big */
1388 :
1389 189 : BPLUS_(private_leaf_t) * root = BPLUS_(private_leaf_acquire)( bplus );
1390 189 : if( FD_UNLIKELY( !root ) ) return NULL; /* no room for insert */
1391 :
1392 189 : root->prev_off = 0UL;
1393 189 : root->next_off = 0UL;
1394 189 : root->pair_cnt = 1UL;
1395 189 : root->pair[0].BPLUS_PAIR_KEY = key[0];
1396 189 : ulong root_off = BPLUS_(private_off)( bplus, root );
1397 189 : bplus->root_off = root_off;
1398 189 : bplus->leaf_min_off = root_off;
1399 189 : bplus->leaf_max_off = root_off;
1400 :
1401 189 : *_insert = 1;
1402 189 : return &root->pair[0];
1403 :
1404 189 : }
1405 :
1406 : /* At this point, the bplus tree is not empty. We recurse through
1407 : interior nodes to find the leaf that should hold key, splitting
1408 : interior nodes as we go. */
1409 :
1410 7524927 : ulong tree_min = BPLUS_(private_tree_min)();
1411 7524927 : ulong tree_max = BPLUS_(private_tree_max)(); /* ==tree_min*2 */
1412 :
1413 7524927 : BPLUS_(private_node_t) * parent = NULL;
1414 7524927 : ulong child_idx = 0UL;
1415 :
1416 7524927 : ulong leaf_lo = bplus->leaf_lo;
1417 47962443 : while( FD_LIKELY( !BPLUS_(private_is_leaf)( tree_off, leaf_lo ) ) ) { /* Optimize for big trees */
1418 40437516 : BPLUS_(private_node_t) * node = BPLUS_(private_node)( bplus, tree_off );
1419 :
1420 : /* At this point, we should insert key into one of the node's trees
1421 : and tree_cnt is in [2,tree_max] (root) or [tree_min,tree_max]
1422 : (non-root). If the node has a parent, parent and all node's
1423 : siblings are nodes and parent has in [2,tree_max-1] (root parent)
1424 : or [tree_min,tree_max-1] (non-root parent) children. (tree_max-1
1425 : because if it had tree_max children when insert started, we would
1426 : have split it on the previous iteration).
1427 :
1428 : If the node is full, split it. */
1429 :
1430 40437516 : ulong tree_cnt = node->tree_cnt;
1431 40437516 : if( FD_UNLIKELY( tree_cnt==tree_max ) ) { /* Optimize for high radix */
1432 :
1433 : /* Acquire resources. If node is the root, this includes making a
1434 : new root node and making the new root node's parent. */
1435 :
1436 231162 : BPLUS_(private_node_t) * new_node = BPLUS_(private_node_acquire)( bplus );
1437 231162 : if( FD_UNLIKELY( !new_node ) ) return NULL; /* No room for insert */
1438 :
1439 231162 : if( FD_UNLIKELY( !parent ) ) {
1440 714 : parent = BPLUS_(private_node_acquire)( bplus );
1441 714 : if( FD_UNLIKELY( !parent ) ) {
1442 0 : BPLUS_(private_node_release)( bplus, new_node );
1443 0 : return NULL; /* No room for insert */
1444 0 : }
1445 :
1446 714 : bplus->root_off = BPLUS_(private_off)( bplus, parent );
1447 :
1448 714 : parent->tree_cnt = 1UL; /* Will be incremented to 2 by the child_insert below. */
1449 714 : parent->tree_off[0] = BPLUS_(private_off)( bplus, node );
1450 :
1451 714 : child_idx = 0UL;
1452 714 : }
1453 :
1454 : /* At this point, node is child child_idx of parent and we need to
1455 : split node. Further, new_node is the node that will be created
1456 : by the split and parent has room to insert a link to new_node.
1457 : Split node evenly into new_node and update the parent
1458 : accordingly. */
1459 :
1460 231162 : BPLUS_KEY_T const * median = &node->pivot[ tree_min-1UL ];
1461 :
1462 231162 : node->tree_cnt = tree_min;
1463 :
1464 231162 : new_node->tree_cnt = tree_min;
1465 231162 : memcpy( new_node->tree_off, node->tree_off + tree_min, sizeof(ulong) * tree_min );
1466 231162 : memcpy( new_node->pivot, node->pivot + tree_min, sizeof(BPLUS_KEY_T)*(tree_min-1UL) );
1467 :
1468 231162 : BPLUS_(private_child_insert)( parent, child_idx+1UL, BPLUS_(private_off)( bplus, new_node ), median );
1469 :
1470 : /* Move into the appropriate split */
1471 :
1472 231162 : node = fd_ptr_if( BPLUS_(private_key_cmp)( key, median )<0, node, new_node );
1473 231162 : tree_cnt = tree_min;
1474 231162 : }
1475 :
1476 : /* At this point, we should insert key into one of the node's trees
1477 : and tree_cnt is in [2,tree_max-1] (root) or [tree_min,tree_max-1]
1478 : (non root) such that we are guaranteed to be able to insert. */
1479 :
1480 40437516 : parent = node;
1481 40437516 : child_idx = BPLUS_(private_node_query)( node->pivot, tree_cnt, key );
1482 40437516 : tree_off = node->tree_off[ child_idx ];
1483 40437516 : }
1484 :
1485 7524927 : BPLUS_(private_leaf_t) * leaf = BPLUS_(private_leaf)( bplus, tree_off );
1486 :
1487 : /* At this point, we'd like to insert key into leaf. But if leaf is
1488 : full, we split it to make room. */
1489 :
1490 7524927 : ulong pair_min = BPLUS_(private_pair_min)();
1491 7524927 : ulong pair_max = BPLUS_(private_pair_max)(); /* ==pair_min*2 */
1492 :
1493 7524927 : ulong pair_cnt = (ulong)leaf->pair_cnt;
1494 7524927 : if( FD_UNLIKELY( pair_cnt==pair_max ) ) { /* optimize for high radix */
1495 :
1496 : /* Acquire resources. If leaf is the root, this includes making a
1497 : new root node and making the new root node's parent. */
1498 :
1499 967035 : BPLUS_(private_leaf_t) * new_leaf = BPLUS_(private_leaf_acquire)( bplus );
1500 967035 : if( FD_UNLIKELY( !new_leaf ) ) return NULL; /* No room for insert */
1501 :
1502 967032 : if( FD_UNLIKELY( !parent ) ) {
1503 159 : parent = BPLUS_(private_node_acquire)( bplus );
1504 159 : if( FD_UNLIKELY( !parent ) ) {
1505 0 : BPLUS_(private_leaf_release)( bplus, new_leaf );
1506 0 : return NULL; /* No room to insert */
1507 0 : }
1508 :
1509 159 : bplus->root_off = BPLUS_(private_off)( bplus, parent );
1510 :
1511 159 : parent->tree_cnt = 1UL; /* Will be incremented to 2 below */
1512 159 : parent->tree_off[0] = BPLUS_(private_off)( bplus, leaf );
1513 :
1514 159 : child_idx = 0UL;
1515 159 : }
1516 :
1517 : /* At this point, leaf is child child_idx of parent and we need to
1518 : split leaf. Further, new_leaf is the leaf that will be created
1519 : by the split and parent has room to insert a link to new_leaf.
1520 : Split leaf evenly into new_leaf and update the parent
1521 : accordingly. Splitting this leaf might make a new max leaf (it
1522 : will never make a new min leaf). */
1523 :
1524 967032 : BPLUS_KEY_T const * median = &leaf->pair[ pair_min ].BPLUS_PAIR_KEY;
1525 :
1526 967032 : ulong next_off = leaf->next_off;
1527 :
1528 967032 : leaf->pair_cnt = pair_min;
1529 967032 : leaf->next_off = BPLUS_(private_off)( bplus, new_leaf );
1530 :
1531 967032 : new_leaf->pair_cnt = pair_min;
1532 967032 : new_leaf->prev_off = BPLUS_(private_off)( bplus, leaf );
1533 967032 : new_leaf->next_off = next_off;
1534 967032 : memcpy( &new_leaf->pair[0], &leaf->pair[pair_min], sizeof(pair_t)*pair_min );
1535 :
1536 : /* FIXME: BRANCHLESS? */
1537 967032 : ulong new_leaf_off = BPLUS_(private_off)( bplus, new_leaf );
1538 967032 : if( FD_UNLIKELY( !next_off ) ) bplus->leaf_max_off = new_leaf_off;
1539 963618 : else BPLUS_(private_leaf)( bplus, next_off )->prev_off = new_leaf_off;
1540 :
1541 967032 : BPLUS_(private_child_insert)( parent, child_idx+1UL, new_leaf_off, median );
1542 :
1543 : /* Move into the appropriate split */
1544 :
1545 967032 : leaf = (BPLUS_(private_key_cmp)( key, median )<0) ? leaf : new_leaf;
1546 967032 : pair_cnt = pair_min;
1547 967032 : }
1548 :
1549 : /* At this point, leaf either contains key or is where we should
1550 : insert key. Further, pair_cnt is in [1,pair_max-1] (root) or
1551 : [pair_min,pair_max-1] (non root). Search for key in the leaf. If
1552 : key is not in the leaf, the search will reveal where to put the
1553 : key. */
1554 :
1555 7524924 : BPLUS_PAIR_T * pair = leaf->pair;
1556 7524924 : ulong i0 = 0UL;
1557 7524924 : ulong i1 = pair_cnt;
1558 12506436 : do {
1559 :
1560 : /* At this point, pairs in [0,i0) are before key, pairs in
1561 : [i1,pair_cnt) are after key and pairs in [i0,i1) (non-empty) are
1562 : not known. Probe the middle of this range for key. */
1563 :
1564 12506436 : ulong im = (i0+i1) >> 1; /* no overflow */
1565 :
1566 12506436 : int cmp = BPLUS_(private_key_cmp)( &pair[ im ].BPLUS_PAIR_KEY, key );
1567 :
1568 : /* If cmp==0, pair im holds the key and we are done. Otherwise, if
1569 : cmp<0 / cmp>0, pair im is before / after key. We adjust the
1570 : ranges appropriately and recurse. */
1571 :
1572 12506436 : if( FD_UNLIKELY( !cmp ) ) { /* optimize for big trees */
1573 3752997 : leaf->pair_cnt = pair_cnt;
1574 3752997 : if( !upsert ) return NULL; /* compile time */
1575 1875972 : *_insert = 0;
1576 1875972 : return &pair[ im ];
1577 3752997 : }
1578 8753439 : i0 = fd_ulong_if( cmp>0, i0, im+1UL );
1579 8753439 : i1 = fd_ulong_if( cmp>0, im, i1 );
1580 :
1581 8753439 : } while( i1>i0 );
1582 :
1583 : /* At this point, leaf does not contain key, pairs [0,i0) are before
1584 : key, pairs [i0,pair_cnt) are after key and we have room for key.
1585 : Move pairs [i0,pair_cnt) right 1 to make room and insert the key at
1586 : pair i0. */
1587 :
1588 3771927 : memmove( pair+i0+1UL, pair+i0, (pair_cnt-i0)*sizeof(BPLUS_PAIR_T) );
1589 3771927 : pair[ i0 ].BPLUS_PAIR_KEY = key[0];
1590 3771927 : leaf->pair_cnt = pair_cnt + 1UL;
1591 3771927 : *_insert = 1;
1592 3771927 : return &pair[ i0 ];
1593 7524924 : }
1594 :
1595 : int
1596 : BPLUS_(remove_key)( BPLUS_(t) * join,
1597 5640936 : BPLUS_KEY_T const * key ) {
1598 5640936 : BPLUS_(private_t) * bplus = BPLUS_(private)( join );
1599 :
1600 : /* If tree is empty, nothing to remove */
1601 :
1602 5640936 : ulong tree_off = bplus->root_off;
1603 5640936 : if( FD_UNLIKELY( !tree_off ) ) return -1; /* not found, optimize for found */
1604 :
1605 : /* At this point, the tree is not empty. Find the path through the
1606 : tree to the leaf with the key to remove. Note that 128 is more
1607 : than enough given strong lg N depth algorithmic guarantees and wide
1608 : radices. */
1609 :
1610 5640840 : BPLUS_(private_node_t) * path_node [ 128 ];
1611 5640840 : ulong path_tree_idx[ 128 ];
1612 5640840 : ulong path_cnt = 0UL;
1613 :
1614 5640840 : ulong leaf_lo = bplus->leaf_lo;
1615 35960184 : while( FD_LIKELY( !BPLUS_(private_is_leaf)( tree_off, leaf_lo ) ) ) { /* optimize for big trees */
1616 30319344 : BPLUS_(private_node_t) * node = BPLUS_(private_node)( bplus, tree_off );
1617 :
1618 30319344 : ulong tree_idx = BPLUS_(private_node_query)( node->pivot, node->tree_cnt, key );
1619 :
1620 30319344 : path_node [ path_cnt ] = node;
1621 30319344 : path_tree_idx[ path_cnt ] = tree_idx;
1622 30319344 : path_cnt++;
1623 :
1624 30319344 : tree_off = node->tree_off[ tree_idx ];
1625 30319344 : }
1626 :
1627 5640840 : BPLUS_(private_leaf_t) * leaf = BPLUS_(private_leaf)( bplus, tree_off );
1628 :
1629 : /* At this point, leaf might contain key. Search for key. */
1630 :
1631 5640840 : BPLUS_PAIR_T * pair = leaf->pair;
1632 5640840 : ulong pair_cnt = leaf->pair_cnt;
1633 5640840 : ulong pair_idx = BPLUS_(private_pair_query)( pair, pair_cnt, key );
1634 :
1635 5640840 : if( FD_UNLIKELY( pair_idx>=pair_cnt ) ) return -1; /* not found, optimize for found */
1636 :
1637 : /* At this point, pair[ pair_idx ] is the pair to remove. Remove it. */
1638 :
1639 3763473 : pair_cnt--;
1640 6947751 : for( ulong idx=pair_idx; idx<pair_cnt; idx++ ) pair[idx] = pair[idx+1UL];
1641 3763473 : leaf->pair_cnt = pair_cnt; /* FIXME: MOVE BELOW? */
1642 :
1643 : /* At this point, the leaf might be unbalanced but everything else in
1644 : the bplus tree is balanced. */
1645 :
1646 3763473 : if( FD_UNLIKELY( !path_cnt ) ) { /* optimize for big trees */
1647 :
1648 : /* At this point, we removed a pair from the root leaf and the
1649 : leaf's pair_cnt is in [0,pair_max-1] . If there are still pairs
1650 : in the leaf, the bplus tree is still balanced and we are done.
1651 : Otherwise, we release the leaf and make an empty bplus tree
1652 : (which is balanced by definition). */
1653 :
1654 561 : if( FD_LIKELY( pair_cnt ) ) return 0; /* optimize for big trees */
1655 186 : bplus->root_off = 0UL;
1656 186 : bplus->leaf_min_off = 0UL;
1657 186 : bplus->leaf_max_off = 0UL;
1658 186 : BPLUS_(private_leaf_release)( bplus, leaf );
1659 186 : return 0;
1660 :
1661 561 : }
1662 :
1663 : /* At this point, we removed a pair from a non-root leaf and the
1664 : leaf's pair_cnt is in [pair_min-1,pair_max-1]. If there are at
1665 : least pair_min pairs left in the leaf, the bplus tree is still
1666 : balanced and we are done. */
1667 :
1668 3762912 : ulong pair_min = BPLUS_(private_pair_min)();
1669 3762912 : ulong pair_max = BPLUS_(private_pair_max)();
1670 :
1671 3762912 : if( FD_LIKELY( pair_cnt>=pair_min ) ) return 0; /* optimize for big trees */
1672 :
1673 : /* At this point, we removed a pair from a non-root leaf and its
1674 : pair_cnt is pair_min-1. As such, it is not balanced with its
1675 : siblings (leaf must have at least leaf_min-1 siblings that must
1676 : also be leaves with a pair_cnt in [pair_min,pair_max]). Determine
1677 : which sibling to use for rebalancing and how to rebalance with this
1678 : sibling. This sibling will have a pair cnt in [pair_min,pair_max].
1679 :
1680 : Note: Could be more adaptive here (e.g. pick the larger sibling
1681 : when leaf is a middle child). */
1682 :
1683 1660878 : path_cnt--;
1684 1660878 : BPLUS_(private_node_t) * parent = path_node [ path_cnt ];
1685 1660878 : ulong child_idx = path_tree_idx[ path_cnt ];
1686 :
1687 1660878 : ulong sib0_idx = child_idx - (ulong)(child_idx>0UL);
1688 1660878 : ulong sib1_idx = sib0_idx + 1UL;
1689 :
1690 1660878 : ulong sib0_off = parent->tree_off[ sib0_idx ];
1691 1660878 : ulong sib1_off = parent->tree_off[ sib1_idx ];
1692 :
1693 1660878 : BPLUS_(private_leaf_t) * sib0 = BPLUS_(private_leaf)( bplus, sib0_off );
1694 1660878 : BPLUS_(private_leaf_t) * sib1 = BPLUS_(private_leaf)( bplus, sib1_off );
1695 :
1696 1660878 : ulong sib0_pair_cnt = sib0->pair_cnt;
1697 1660878 : ulong sib1_pair_cnt = sib1->pair_cnt;
1698 :
1699 1660878 : ulong reb_pair_cnt = sib0_pair_cnt + sib1_pair_cnt; /* in [pair_max-1,2*pair_max-1]. */
1700 1660878 : if( FD_LIKELY( reb_pair_cnt>=pair_max ) ) {
1701 :
1702 : /* At this point, reb_pair_cnt is in [pair_max,2*pair_max-1].
1703 : Divide these as evenly as possible between sib0 and sib1 and
1704 : update the parent's pivot accordingly. Since we do not remove
1705 : any trees from the parent, this will rebalance the whole bplus
1706 : tree fully and we are done. */
1707 :
1708 697206 : ulong new_sib0_pair_cnt = reb_pair_cnt >> 1;
1709 697206 : ulong new_sib1_pair_cnt = reb_pair_cnt - new_sib0_pair_cnt;
1710 :
1711 697206 : if( new_sib0_pair_cnt>sib0_pair_cnt ) { /* Shift pairs from sib1 into sib0 */
1712 :
1713 167931 : ulong delta = new_sib0_pair_cnt - sib0_pair_cnt;
1714 167931 : memcpy ( sib0->pair + sib0_pair_cnt, sib1->pair, sizeof(BPLUS_PAIR_T)*delta );
1715 167931 : memmove( sib1->pair, sib1->pair + delta, sizeof(BPLUS_PAIR_T)*new_sib1_pair_cnt );
1716 :
1717 529275 : } else { /* Shift pairs from sib0 into sib1 */
1718 :
1719 529275 : ulong delta = sib0_pair_cnt - new_sib0_pair_cnt;
1720 529275 : memmove( sib1->pair + delta, sib1->pair, sizeof(BPLUS_PAIR_T)*sib1_pair_cnt );
1721 529275 : memcpy ( sib1->pair, sib0->pair + new_sib0_pair_cnt, sizeof(BPLUS_PAIR_T)*delta );
1722 :
1723 529275 : }
1724 :
1725 697206 : sib0->pair_cnt = new_sib0_pair_cnt;
1726 697206 : sib1->pair_cnt = new_sib1_pair_cnt;
1727 :
1728 697206 : parent->pivot[sib0_idx] = sib1->pair[0].BPLUS_PAIR_KEY;
1729 697206 : return 0;
1730 697206 : }
1731 :
1732 : /* At this point, reb_pair_cnt is pair_max-1 such that these siblings
1733 : must be merged to restore balance among the leaves. This might
1734 : change the leaf max from sib1 to sib0. */
1735 :
1736 963672 : memcpy( sib0->pair + sib0_pair_cnt, sib1->pair, sizeof(BPLUS_PAIR_T)*sib1_pair_cnt );
1737 963672 : sib0->pair_cnt = reb_pair_cnt;
1738 :
1739 963672 : ulong sib2_off = sib1->next_off;
1740 963672 : sib0->next_off = sib2_off;
1741 :
1742 : /* FIXME: DO BRANCHLESS? */
1743 963672 : if( FD_UNLIKELY( !sib2_off ) ) bplus->leaf_max_off = sib0_off;
1744 961158 : else BPLUS_(private_leaf)( bplus, sib2_off )->prev_off = sib0_off;
1745 :
1746 963672 : BPLUS_(private_child_remove)( parent, sib1_idx );
1747 963672 : BPLUS_(private_leaf_release)( bplus, sib1 );
1748 :
1749 : /* The merge might have unbalance parent among its siblings. If it
1750 : has not, we are done. Otherwise, we rebalance parent among its
1751 : siblings. That might unbalance the grandparent among its siblings.
1752 : And so on along the path potentially all the back to the bplus tree
1753 : root. */
1754 :
1755 963672 : ulong tree_min = BPLUS_(private_tree_min)();
1756 963672 : ulong tree_max = BPLUS_(private_tree_max)();
1757 :
1758 1193622 : while( FD_LIKELY( path_cnt ) ) { /* optimize for big trees */
1759 1189584 : BPLUS_(private_node_t) * child = parent;
1760 :
1761 : /* At this point, because we just removed a tree from child, child's
1762 : tree_cnt is in [tree_min-1,tree_max-1] but everything else is
1763 : balanced. If the child has at least tree_min trees, the bplus
1764 : tree is still balanced. */
1765 :
1766 1189584 : ulong child_tree_cnt = child->tree_cnt;
1767 1189584 : if( FD_LIKELY( child_tree_cnt>=tree_min ) ) return 0; /* optimize for big trees */
1768 :
1769 : /* At this point, child's tree_cnt is tree_min-1. As such, it is
1770 : not balanced with its siblings (child must have at least
1771 : leaf_min-1 siblings that must also be nodes with a tree_cnt in
1772 : [tree_min,tree_max]). Determine which sibling to use for
1773 : rebalancing and how to rebalance.
1774 :
1775 : Note: Could be more adaptive here (e.g. pick the larger sibling
1776 : if a middle child). */
1777 :
1778 410850 : path_cnt--;
1779 410850 : parent = path_node [ path_cnt ];
1780 410850 : child_idx = path_tree_idx[ path_cnt ];
1781 :
1782 410850 : ulong sib0_idx = child_idx - (ulong)(child_idx>0UL);
1783 410850 : ulong sib1_idx = sib0_idx + 1UL;
1784 :
1785 410850 : ulong sib0_off = parent->tree_off[ sib0_idx ];
1786 410850 : ulong sib1_off = parent->tree_off[ sib1_idx ];
1787 :
1788 410850 : BPLUS_(private_node_t) * sib0 = BPLUS_(private_node)( bplus, sib0_off );
1789 410850 : BPLUS_(private_node_t) * sib1 = BPLUS_(private_node)( bplus, sib1_off );
1790 :
1791 410850 : ulong sib0_tree_cnt = sib0->tree_cnt;
1792 410850 : ulong sib1_tree_cnt = sib1->tree_cnt;
1793 :
1794 410850 : ulong reb_tree_cnt = sib0_tree_cnt + sib1_tree_cnt; /* in [tree_max-1,2*tree_max-1]. */
1795 410850 : if( FD_LIKELY( reb_tree_cnt>=tree_max ) ) {
1796 :
1797 : /* At this point, reb_tree_cnt is in [tree_max,2*tree_max-1].
1798 : Divide these as evenly as possible between sib0 and sib1 and
1799 : update the parent's pivot accordingly. Since we do not remove
1800 : any trees from parent, this will rebalance the whole bplus tree
1801 : and we are done. */
1802 :
1803 180900 : ulong new_sib0_tree_cnt = reb_tree_cnt >> 1;
1804 180900 : ulong new_sib1_tree_cnt = reb_tree_cnt - new_sib0_tree_cnt;
1805 :
1806 180900 : if( new_sib0_tree_cnt>sib0_tree_cnt ) { /* Shift leading sib1 trees to trailing sib0 trees */
1807 :
1808 43557 : ulong delta = new_sib0_tree_cnt - sib0_tree_cnt;
1809 43557 : memcpy ( sib0->tree_off + sib0_tree_cnt, sib1->tree_off, sizeof(ulong)*delta );
1810 43557 : memmove( sib1->tree_off, sib1->tree_off + delta, sizeof(ulong)*new_sib1_tree_cnt );
1811 :
1812 : /* Copy parent pivot and leading delta-1 sib1 pivots into sib0. */
1813 :
1814 43557 : sib0->pivot[ sib0_tree_cnt-1UL ] = parent->pivot[ sib0_idx ];
1815 43557 : memcpy( sib0->pivot + sib0_tree_cnt, sib1->pivot, (delta-1UL)*sizeof(BPLUS_KEY_T) );
1816 :
1817 : /* At this point, there is 1 hole in the parent pivots and
1818 : delta-1 holes in the leading sib1 pivots. Copy the next sib1
1819 : pivot to the parent. */
1820 :
1821 43557 : parent->pivot[ sib0_idx ] = sib1->pivot[ delta-1UL ];
1822 :
1823 : /* At this point, there are delta holes in the leading sib1
1824 : pivots. Shift remaining sib1 pivots down delta. */
1825 :
1826 43557 : memmove( sib1->pivot, sib1->pivot+delta, (new_sib1_tree_cnt-1UL)*sizeof(BPLUS_KEY_T) );
1827 :
1828 137343 : } else { /* Shift trailing sib0 trees to leading sib1 trees */
1829 :
1830 137343 : ulong delta = sib0_tree_cnt - new_sib0_tree_cnt;
1831 137343 : memmove( sib1->tree_off + delta, sib1->tree_off, sizeof(ulong)*sib1_tree_cnt );
1832 137343 : memcpy ( sib1->tree_off, sib0->tree_off + new_sib0_tree_cnt, sizeof(ulong)*delta );
1833 :
1834 : /* Shift sib1 pivots up delta. */
1835 :
1836 137343 : memmove( sib1->pivot+delta, sib1->pivot, (sib1_tree_cnt-1UL)*sizeof(BPLUS_KEY_T) );
1837 :
1838 : /* At this point, there are delta holes in the leading sib1
1839 : pivots. Copy trailing delta-1 sib0 pivots and parent pivot
1840 : into sib1. */
1841 :
1842 137343 : memcpy( sib1->pivot, sib0->pivot+new_sib0_tree_cnt, (delta-1UL)*sizeof(BPLUS_KEY_T) );
1843 137343 : sib1->pivot[ delta-1UL ] = parent->pivot[ sib0_idx ];
1844 :
1845 : /* At this point, there is 1 hole in the parent pivot. Copy
1846 : trailing sib0 pivot into parent. */
1847 :
1848 137343 : parent->pivot[ sib0_idx ] = sib0->pivot[ new_sib0_tree_cnt-1UL ];
1849 :
1850 137343 : }
1851 :
1852 180900 : sib0->tree_cnt = new_sib0_tree_cnt;
1853 180900 : sib1->tree_cnt = new_sib1_tree_cnt;
1854 180900 : return 0;
1855 180900 : }
1856 :
1857 : /* At this point, reb_tree_cnt is tree_max-1 such that these
1858 : siblings must be merged to restore balance among siblings. Since
1859 : this might unbalance parent relative to its siblings, we need to
1860 : keep iterating. */
1861 :
1862 229950 : memcpy( sib0->tree_off + sib0_tree_cnt, sib1->tree_off, sizeof(ulong)*sib1_tree_cnt );
1863 :
1864 229950 : sib0->pivot[ sib0_tree_cnt-1UL ] = parent->pivot[ sib0_idx ];
1865 229950 : memcpy( sib0->pivot + sib0_tree_cnt, sib1->pivot, sizeof(BPLUS_KEY_T)*(sib1_tree_cnt-1UL) );
1866 :
1867 229950 : sib0->tree_cnt = reb_tree_cnt;
1868 :
1869 229950 : BPLUS_(private_child_remove)( parent, sib1_idx );
1870 :
1871 229950 : BPLUS_(private_node_release)( bplus, sib1 );
1872 229950 : }
1873 :
1874 : /* At this point, parent is the root node and we just removed a tree
1875 : from it. If parent still has more than 1 tree, the bplus tree is
1876 : balanced and we are done. Otherwise, we make parent's sole child
1877 : the new root and release parent to finish balancing the tree. */
1878 :
1879 4038 : if( FD_LIKELY( parent->tree_cnt>1UL ) ) return 0; /* optimize for big trees */
1880 :
1881 855 : bplus->root_off = parent->tree_off[ 0 ];
1882 855 : BPLUS_(private_node_release)( bplus, parent );
1883 855 : return 0;
1884 4038 : }
1885 :
1886 : BPLUS_(iter_t)
1887 : BPLUS_(private_iter)( BPLUS_(t) const * join,
1888 : BPLUS_KEY_T const * query,
1889 14996460 : int op ) {
1890 14996460 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
1891 :
1892 14996460 : BPLUS_(iter_t) iter;
1893 :
1894 : /* If the bplus is empty or query is NULL, return nul */
1895 :
1896 14996460 : ulong tree_off = bplus->root_off;
1897 14996460 : if( FD_UNLIKELY( (!tree_off) | (!query) ) ) { /* empty, optimize for big trees */
1898 348 : iter.leaf_off = 0UL;
1899 348 : iter.pair_idx = 0UL;
1900 348 : return iter;
1901 348 : }
1902 :
1903 : /* At this point, the bplus is not empty. Find the leaf that might
1904 : contain query. */
1905 :
1906 14996112 : ulong leaf_lo = bplus->leaf_lo;
1907 95611296 : while( FD_LIKELY( !BPLUS_(private_is_leaf)( tree_off, leaf_lo ) ) ) { /* Optimize for big trees */
1908 80615184 : BPLUS_(private_node_t) const * node = BPLUS_(private_node_const)( bplus, tree_off );
1909 80615184 : tree_off = node->tree_off[ BPLUS_(private_node_query)( node->pivot, node->tree_cnt, query ) ];
1910 80615184 : }
1911 14996112 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, tree_off );
1912 :
1913 : /* At this point, pairs in the previous leaf (if any) have keys less
1914 : than query and pairs in the next leaf (if any) have keys greater
1915 : than query. Search the leaf for query. */
1916 :
1917 14996112 : BPLUS_PAIR_T const * pair = leaf->pair;
1918 14996112 : ulong pair_cnt = leaf->pair_cnt;
1919 :
1920 14996112 : ulong i0 = 0UL;
1921 14996112 : ulong i1 = pair_cnt;
1922 :
1923 23399328 : do {
1924 :
1925 : /* At this point, the range [i0,i1) contains at least 1 pair. Pairs
1926 : [0,i0) have keys less than query, pairs [i1,pair_cnt) have keys
1927 : greater than query and we don't know about pairs [i0,i1). Test
1928 : the pair in the middle.
1929 :
1930 : If this pair's key matches query, because all keys are unique, we
1931 : know that pair im is the first pair greater than or equal to
1932 : query and that pair im+1 is the first pair greater than query.
1933 :
1934 : If this pair's key is greater than query, we know all pairs in
1935 : [im,pair_cnt) are greater than query so we update i1 to im.
1936 :
1937 : If this pair's key is less than query, we know that all pairs in
1938 : [0,im+1) are less than query so we update i0 to im+1. */
1939 :
1940 23399328 : ulong im = (i0+i1) >> 1; /* No overflow */
1941 :
1942 23399328 : int cmp = BPLUS_(private_key_cmp)( &pair[im].BPLUS_PAIR_KEY, query );
1943 23399328 : if( FD_UNLIKELY( !cmp ) ) { /* optimize for big trees */
1944 :
1945 : /* At this point, pairs [0,im) have keys less than query, pair im
1946 : key matches query and pairs (im,pair_cnt) are greater than
1947 : query. If:
1948 :
1949 : op==0 (GE): pick i0 == im such that [0,i0) are < query and [i0,pair_cnt) are >= query
1950 : op==1 (GT): pick i0 == im+1 such that [0,i0) are <= query and [i0,pair_cnt) are > query
1951 : op==2 (LE): pick i0 == im+1 such that [0,i0) are <= query and [i0,pair_cnt) are > query
1952 : op==3 (LT): pick i0 == im such that [0,i0) are < query and [i0,pair_cnt) are >= query */
1953 :
1954 7494132 : i0 = im + (ulong)((op==1) | (op==2)); /* compile time */
1955 7494132 : break;
1956 7494132 : }
1957 15905196 : i0 = fd_ulong_if( cmp>0, i0, im+1UL );
1958 15905196 : i1 = fd_ulong_if( cmp>0, im, i1 );
1959 :
1960 15905196 : } while( FD_LIKELY( i1-i0 ) ); /* optimize for big trees */
1961 :
1962 : /* At this point:
1963 :
1964 : op==0 (GE): pairs [i0,pair_cnt) have keys greater than or equal to query
1965 : op==1 (GT): pairs [i0,pair_cnt) have keys greater than query
1966 : op==2 (LE): pairs [0,i0) have keys less than or equal to query
1967 : op==3 (LT): pairs [0,i0) have keys less than query */
1968 :
1969 14996112 : if( op<=1 ) { /* compile time */
1970 :
1971 7498056 : if( FD_UNLIKELY( i0==pair_cnt ) ) { /* optimize for big trees */
1972 :
1973 : /* At this point:
1974 :
1975 : op==0 (GE): all pairs have keys less than query and pairs in any next leaf have keys greater than query
1976 : op==1 (GT): all pairs have keys less than or equal to query and pairs in any next leaf have keys greater than query
1977 :
1978 : position iterator at first pair in next leaf (or nul if this is
1979 : the max leaf). */
1980 :
1981 4491171 : tree_off = leaf->next_off;
1982 4491171 : i0 = 0UL;
1983 4491171 : }
1984 :
1985 7498056 : } else {
1986 :
1987 7498056 : if( FD_UNLIKELY( i0==0UL ) ) { /* optimize for big trees */
1988 :
1989 : /* At this point:
1990 :
1991 : op==2 (LE): all pairs have keys greater than query and pairs in any prev leaf have keys less than query
1992 : op==3 (LT): all pairs have keys greater than or equal to query and pairs in any prev leaf have keys less than query
1993 :
1994 : position iterator at last pair in previous leaf (or nul if this
1995 : is the min leaf). */
1996 :
1997 742896 : tree_off = leaf->prev_off;
1998 742896 : i0 = FD_UNLIKELY( !tree_off ) ? 1UL : BPLUS_(private_leaf_const)( bplus, tree_off )->pair_cnt;
1999 742896 : }
2000 7498056 : i0--;
2001 :
2002 7498056 : }
2003 :
2004 14996112 : iter.leaf_off = tree_off;
2005 14996112 : iter.pair_idx = i0;
2006 14996112 : return iter;
2007 14996460 : }
2008 :
2009 : int
2010 1873254 : BPLUS_(verify)( BPLUS_(t) const * join ) {
2011 :
2012 55111332036 : # define BPLUS_TEST(c) do { \
2013 52928495214 : if( FD_UNLIKELY( !(c) ) ) { \
2014 0 : FD_LOG_WARNING(( "FAIL: %s", #c )); \
2015 0 : return -1; \
2016 0 : } \
2017 52928495214 : } while(0)
2018 :
2019 : /* Verify join */
2020 :
2021 1873254 : BPLUS_TEST( join );
2022 :
2023 1873254 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
2024 :
2025 1873254 : BPLUS_TEST( fd_ulong_is_aligned( (ulong)bplus, BPLUS_ALIGN ) );
2026 :
2027 : /* Verify header */
2028 :
2029 1873254 : BPLUS_TEST( bplus->magic==BPLUS_MAGIC );
2030 :
2031 1873254 : ulong node_max = bplus->node_max;
2032 1873254 : ulong leaf_max = bplus->leaf_max;
2033 :
2034 1873254 : BPLUS_TEST( node_max<=BPLUS_(private_node_max_max)() );
2035 1873254 : BPLUS_TEST( leaf_max<=BPLUS_(private_leaf_max_max)() );
2036 :
2037 1873254 : ulong node_lo = bplus->node_lo;
2038 1873254 : ulong leaf_lo = bplus->leaf_lo;
2039 :
2040 1873254 : BPLUS_TEST( node_lo==fd_ulong_align_up( sizeof( BPLUS_(private_t) ), BPLUS_NODE_ALIGN ) );
2041 1873254 : BPLUS_TEST( leaf_lo==fd_ulong_align_up( node_lo + node_max*sizeof( BPLUS_(private_node_t) ), BPLUS_LEAF_ALIGN ) );
2042 :
2043 1873254 : ulong node_hi = node_lo + node_max*sizeof( BPLUS_(private_node_t) );
2044 1873254 : ulong leaf_hi = leaf_lo + leaf_max*sizeof( BPLUS_(private_leaf_t) );
2045 :
2046 1873254 : ulong root_off = bplus->root_off;
2047 1873254 : ulong leaf_min_off = bplus->leaf_min_off;
2048 1873254 : ulong leaf_max_off = bplus->leaf_max_off;
2049 :
2050 1873254 : if( FD_LIKELY( root_off ) ) {
2051 :
2052 1873170 : BPLUS_TEST( node_lo<=root_off ); BPLUS_TEST( root_off<leaf_hi );
2053 1873170 : BPLUS_TEST( fd_ulong_is_aligned( root_off, fd_ulong_if( !BPLUS_(private_is_leaf)( root_off, leaf_lo ),
2054 1873170 : BPLUS_NODE_ALIGN, BPLUS_LEAF_ALIGN ) ) );
2055 :
2056 1873170 : BPLUS_TEST( leaf_lo<=leaf_min_off ); BPLUS_TEST( leaf_min_off<leaf_hi );
2057 1873170 : BPLUS_TEST( fd_ulong_is_aligned( leaf_min_off, BPLUS_LEAF_ALIGN ) );
2058 :
2059 1873170 : BPLUS_TEST( leaf_lo<=leaf_max_off ); BPLUS_TEST( leaf_max_off<leaf_hi );
2060 1873170 : BPLUS_TEST( fd_ulong_is_aligned( leaf_max_off, BPLUS_LEAF_ALIGN ) );
2061 :
2062 1873170 : } else {
2063 :
2064 84 : BPLUS_TEST( !leaf_min_off );
2065 84 : BPLUS_TEST( !leaf_max_off );
2066 :
2067 84 : }
2068 :
2069 1873254 : ulong node_rem = bplus->node_max;
2070 1873254 : ulong leaf_rem = bplus->leaf_max;
2071 :
2072 : /* Verify node pool */
2073 :
2074 1873254 : ulong node_off = bplus->node_pool_off;
2075 1336552995 : while( FD_LIKELY( node_off ) ) {
2076 1334679741 : BPLUS_TEST( node_rem ); node_rem--;
2077 1334679741 : BPLUS_TEST( node_lo<=node_off ); BPLUS_TEST( node_off<node_hi );
2078 1334679741 : BPLUS_TEST( fd_ulong_is_aligned( node_off, BPLUS_NODE_ALIGN ) );
2079 1334679741 : node_off = BPLUS_(private_node_const)( bplus, node_off )->tree_off[0];
2080 1334679741 : }
2081 :
2082 : /* Verify leaf pool */
2083 :
2084 1873254 : ulong leaf_off = bplus->leaf_pool_off;
2085 2242739487 : while( FD_LIKELY( leaf_off ) ) {
2086 2240866233 : BPLUS_TEST( leaf_rem ); leaf_rem--;
2087 2240866233 : BPLUS_TEST( leaf_lo<=leaf_off ); BPLUS_TEST( leaf_off<leaf_hi );
2088 2240866233 : BPLUS_TEST( fd_ulong_is_aligned( leaf_off, BPLUS_LEAF_ALIGN ) );
2089 2240866233 : leaf_off = BPLUS_(private_leaf_const)( bplus, leaf_off )->next_off;
2090 2240866233 : }
2091 :
2092 : /* Verify the actual tree */
2093 :
2094 1873254 : ulong leaf_cnt = leaf_rem;
2095 :
2096 1873254 : if( FD_LIKELY( root_off ) ) { /* optimize for big trees */
2097 :
2098 : /* At this point, the tree is not empty */
2099 :
2100 1873170 : ulong tree_min = BPLUS_(private_tree_min)();
2101 1873170 : ulong tree_max = BPLUS_(private_tree_max)();
2102 :
2103 1873170 : ulong pair_min = BPLUS_(private_pair_min)();
2104 1873170 : ulong pair_max = BPLUS_(private_pair_max)();
2105 :
2106 1873170 : ulong stack_tree_off [ 128 ];
2107 1873170 : ulong stack_subtree_idx[ 128 ];
2108 1873170 : BPLUS_KEY_T const * stack_key_lo [ 128 ];
2109 1873170 : BPLUS_KEY_T const * stack_key_hi [ 128 ];
2110 1873170 : ulong stack_cnt = 0UL;
2111 1873170 : ulong stack_max = 128UL;
2112 :
2113 1873170 : ulong tree_off = root_off;
2114 1873170 : ulong subtree_idx = 0UL;
2115 1873170 : BPLUS_KEY_T const * key_lo = NULL;
2116 1873170 : BPLUS_KEY_T const * key_hi = NULL;
2117 :
2118 3776521611 : for(;;) {
2119 :
2120 : /* At this point, we are still validating the tree rooted at
2121 : tree_off and this tree should contain only keys in
2122 : [key_lo,key_hi). key_{lo,hi}==NULL indicates key_{lo,hi} is
2123 : {-inf,+inf}.
2124 :
2125 : If tree is a node, we've validated all of tree's subtrees
2126 : [0,subtree_idx). subtree_idx==0 indicates this is the first
2127 : time we've visited this node.
2128 :
2129 : If tree is a leaf, as we only visit each leaf exactly once,
2130 : subtree_idx will be zero (and otherwise ignored). */
2131 :
2132 3776521611 : if( FD_LIKELY( !BPLUS_(private_is_leaf)( tree_off, leaf_lo ) ) ) { /* tree is a node */
2133 :
2134 : /* If this is the first time visiting this node, validate it */
2135 :
2136 2180963652 : if( FD_UNLIKELY( !subtree_idx ) ) {
2137 :
2138 : /* Validate no loops */
2139 :
2140 587278863 : BPLUS_TEST( node_rem ); node_rem--;
2141 :
2142 : /* Validate the node pointer */
2143 :
2144 587278863 : BPLUS_TEST( node_lo<=tree_off ); BPLUS_TEST( tree_off<node_hi );
2145 587278863 : BPLUS_TEST( fd_ulong_is_aligned( tree_off, BPLUS_NODE_ALIGN ) );
2146 :
2147 587278863 : BPLUS_(private_node_t) const * node = BPLUS_(private_node_const)( bplus, tree_off );
2148 :
2149 587278863 : BPLUS_KEY_T const * subtree_pivot = node->pivot;
2150 587278863 : ulong const * subtree_off = node->tree_off;
2151 587278863 : ulong subtree_cnt = node->tree_cnt;
2152 :
2153 : /* Validate the node tree count */
2154 :
2155 587278863 : BPLUS_TEST( fd_ulong_if( tree_off!=root_off, tree_min, 2UL )<=subtree_cnt );
2156 587278863 : BPLUS_TEST( subtree_cnt<=tree_max );
2157 :
2158 : /* Validate the node tree offsets */
2159 :
2160 587278863 : int is_leaf = BPLUS_(private_is_leaf)( subtree_off[0], leaf_lo );
2161 :
2162 587278863 : ulong lo = fd_ulong_if( is_leaf, leaf_lo, node_lo );
2163 587278863 : ulong hi = fd_ulong_if( is_leaf, leaf_hi, node_hi );
2164 587278863 : ulong align = fd_ulong_if( is_leaf, BPLUS_LEAF_ALIGN, BPLUS_NODE_ALIGN );
2165 :
2166 2768242515 : for( ulong idx=0UL; idx<subtree_cnt; idx++ ) {
2167 2180963652 : ulong off = subtree_off[ idx ];
2168 2180963652 : BPLUS_TEST( lo<=off ); BPLUS_TEST( off<hi );
2169 2180963652 : BPLUS_TEST( fd_ulong_is_aligned( off, align ) );
2170 2180963652 : }
2171 :
2172 : /* Validate the node pivots */
2173 :
2174 587278863 : if( FD_LIKELY( key_lo ) ) BPLUS_TEST( BPLUS_(private_key_cmp)( key_lo, &subtree_pivot[0] )<0 );
2175 :
2176 1593684789 : for( ulong idx=1UL; idx<subtree_cnt-1UL; idx++ )
2177 1006405926 : BPLUS_TEST( BPLUS_(private_key_cmp)( &subtree_pivot[idx-1UL], &subtree_pivot[idx] )<0 );
2178 :
2179 587278863 : if( FD_LIKELY( key_hi ) ) BPLUS_TEST( BPLUS_(private_key_cmp)( &subtree_pivot[subtree_cnt-2UL], key_hi )<0 );
2180 587278863 : }
2181 :
2182 : /* At this point, tree_off is a bplus global offset of a
2183 : verified node (verified either just now or on a previous
2184 : iteration). If subtree_idx isn't the last subtree, push
2185 : subtree_idx+1 onto the stack for a later iteration. */
2186 :
2187 2180963652 : BPLUS_(private_node_t) const * node = BPLUS_(private_node_const)( bplus, tree_off );
2188 :
2189 2180963652 : BPLUS_KEY_T const * subtree_pivot = node->pivot;
2190 2180963652 : ulong const * subtree_off = node->tree_off;
2191 2180963652 : ulong subtree_cnt = node->tree_cnt;
2192 :
2193 2180963652 : if( FD_LIKELY( (subtree_idx+1UL)<subtree_cnt ) ) {
2194 1593684789 : BPLUS_TEST( stack_cnt<stack_max );
2195 1593684789 : stack_tree_off [ stack_cnt ] = tree_off;
2196 1593684789 : stack_subtree_idx[ stack_cnt ] = subtree_idx+1UL;
2197 1593684789 : stack_key_lo [ stack_cnt ] = key_lo;
2198 1593684789 : stack_key_hi [ stack_cnt ] = key_hi;
2199 1593684789 : stack_cnt++;
2200 1593684789 : }
2201 :
2202 : /* And recurse into subtree_idx for the next iteration. Note
2203 : this node's key_lo is subtree_idx 0's key_lo and this node's
2204 : key_hi is subtree_idx tree_cnt-1's key_hi. */
2205 :
2206 2180963652 : /**/ tree_off = subtree_off [ subtree_idx ];
2207 2180963652 : if( FD_LIKELY( subtree_idx>0UL ) ) key_lo = &subtree_pivot[ subtree_idx-1UL ];
2208 2180963652 : if( FD_LIKELY( subtree_idx<subtree_cnt-1UL ) ) key_hi = &subtree_pivot[ subtree_idx ];
2209 2180963652 : subtree_idx = 0UL;
2210 2180963652 : continue;
2211 2180963652 : }
2212 :
2213 : /* At this point, tree is a leaf. Validate no loops. */
2214 :
2215 1595557959 : BPLUS_TEST( leaf_rem ); leaf_rem--;
2216 :
2217 : /* Validate the leaf pointer */
2218 :
2219 1595557959 : BPLUS_TEST( leaf_lo<=tree_off ); BPLUS_TEST( tree_off<leaf_hi );
2220 1595557959 : BPLUS_TEST( fd_ulong_is_aligned( tree_off, BPLUS_LEAF_ALIGN ) );
2221 :
2222 1595557959 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, tree_off );
2223 :
2224 1595557959 : BPLUS_PAIR_T const * pair = leaf->pair;
2225 1595557959 : ulong pair_cnt = leaf->pair_cnt;
2226 :
2227 : /* Validate the leaf pair count */
2228 :
2229 1595557959 : BPLUS_TEST( fd_ulong_if( tree_off!=root_off, pair_min, 1UL )<=pair_cnt );
2230 1595557959 : BPLUS_TEST( pair_cnt<=pair_max );
2231 :
2232 : /* Validate the leaf pairs */
2233 :
2234 1595557959 : if( FD_LIKELY( key_lo ) ) BPLUS_TEST( BPLUS_(private_key_cmp)( key_lo, &pair[0].BPLUS_PAIR_KEY )<=0 );
2235 :
2236 4031130432 : for( ulong idx=1UL; idx<pair_cnt; idx++ )
2237 2435572473 : BPLUS_TEST( BPLUS_(private_key_cmp)( &pair[idx-1UL].BPLUS_PAIR_KEY, &pair[idx].BPLUS_PAIR_KEY )<0 );
2238 :
2239 1595557959 : if( FD_LIKELY( key_hi ) ) BPLUS_TEST( BPLUS_(private_key_cmp)( &pair[ pair_cnt-1UL ].BPLUS_PAIR_KEY, key_hi )<0 );
2240 :
2241 : /* (Note that we validate the leaf ordered iterator below.) */
2242 :
2243 : /* If no more work to do, abort. Otherwise, get the next node to
2244 : process. */
2245 :
2246 1595557959 : if( FD_UNLIKELY( !stack_cnt ) ) break;
2247 1593684789 : stack_cnt--;
2248 1593684789 : tree_off = stack_tree_off [ stack_cnt ];
2249 1593684789 : subtree_idx = stack_subtree_idx[ stack_cnt ];
2250 1593684789 : key_lo = stack_key_lo [ stack_cnt ];
2251 1593684789 : key_hi = stack_key_hi [ stack_cnt ];
2252 1593684789 : }
2253 1873170 : }
2254 :
2255 : /* Validate all nodes and leaves touched */
2256 :
2257 1873254 : BPLUS_TEST( !node_rem );
2258 1873254 : BPLUS_TEST( !leaf_rem );
2259 :
2260 : /* Validate leaf iteration */
2261 :
2262 1873254 : leaf_rem = leaf_cnt;
2263 :
2264 1873254 : ulong leaf_prev_off = 0UL;
2265 1873254 : /**/ leaf_off = bplus->leaf_min_off;
2266 1597431213 : while( leaf_off ) { /* Validates leaf->next_off for last iteration */
2267 :
2268 : /* Validate no loops */
2269 :
2270 1595557959 : BPLUS_TEST( leaf_rem ); leaf_rem--;
2271 :
2272 : /* Validate forward iteration (validates bplus->leaf_min_off first
2273 : iteration, validates leaf->next_off interior iterations) */
2274 :
2275 1595557959 : BPLUS_TEST( leaf_lo<=leaf_off ); BPLUS_TEST( leaf_off<leaf_hi );
2276 1595557959 : BPLUS_TEST( fd_ulong_is_aligned( leaf_off, BPLUS_LEAF_ALIGN ) );
2277 1595557959 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, leaf_off );
2278 :
2279 : /* Validate reverse iteration (validates leaf->prev_off,
2280 : bplus->leaf_max_off validated below) */
2281 :
2282 1595557959 : BPLUS_TEST( leaf->prev_off==leaf_prev_off );
2283 :
2284 : /* Validate ordered leaves */
2285 :
2286 1595557959 : if( FD_LIKELY( leaf_prev_off ) ) {
2287 1593684789 : BPLUS_(private_leaf_t) const * prev = BPLUS_(private_leaf_const)( bplus, leaf_prev_off );
2288 1593684789 : BPLUS_TEST( BPLUS_(private_key_cmp)( &prev->pair[ prev->pair_cnt-1UL ].BPLUS_PAIR_KEY, &leaf->pair[ 0 ].BPLUS_PAIR_KEY )<0 );
2289 1593684789 : }
2290 :
2291 1595557959 : leaf_prev_off = leaf_off;
2292 1595557959 : leaf_off = leaf->next_off;
2293 1595557959 : }
2294 :
2295 1873254 : BPLUS_TEST( bplus->leaf_max_off==leaf_prev_off ); /* Validates bplus->leaf_max_off */
2296 1873254 : BPLUS_TEST( !leaf_rem ); /* All leaves in tree covered */
2297 :
2298 1873254 : # undef BPLUS_TEST
2299 :
2300 1873254 : return 0;
2301 1873254 : }
2302 :
2303 : #endif
2304 :
2305 : #undef BPLUS_STATIC
2306 : #undef BPLUS_
2307 :
2308 : #undef BPLUS_IMPL_STYLE
2309 : #undef BPLUS_MAGIC
2310 : #undef BPLUS_LEAF_ALIGN
2311 : #undef BPLUS_NODE_ALIGN
2312 : #undef BPLUS_ALIGN
2313 : #undef BPLUS_TREE_MAX
2314 : #undef BPLUS_NODE_MAX
2315 : #undef BPLUS_PAIR_T
2316 : #undef BPLUS_KEY_T
2317 : #undef BPLUS_NAME
|