Line data Source code
1 : /* Generate prototypes, inlines and/or implementations for an ultra high
2 : performance bplus-tree-based key-val store. A bplus tree can be
3 : persisted beyond the lifetime of creating process, used concurrently,
4 : used IPC, relocated in memory, naively serialized/deserialized and/or
5 : moved between hosts. Virtually all operations on a bplus tree are a
6 : fast O(lg N) (where N is the number of elements stored) or better in
7 : worst case.
8 :
9 : At its core, this is fast binary search on a sorted array. But the
10 : sorted array has been partition into leaves where each leaf is
11 : responsible for a continuous disjoint portion of the key space and
12 : union of the ranges covered by the leaves covers the entire key
13 : space. The leaves are stored in a tree whose nodes have a large and
14 : flexible number of branches per each node that specify how leaves
15 : completely partition key space. Further, to support fast forward and
16 : reverse iteration, the leaves are organized into a sorted doubly
17 : linked list. Lastly, the interior nodes and leaves are guaranteed to
18 : be full enough that query has a fast O(lg N) worst case and have
19 : enough slack that insert / upsert / remove also have fast O(lg N)
20 : worst case.
21 :
22 : This leads to a number of improvements over textbook bplus trees,
23 : including:
24 :
25 : - Removal doesn't require nearly as much reshuffling of the interior
26 : nodes. The only requirement here is that interior nodes form a
27 : complete partitioning of the key space. (There is no requirement
28 : that interior nodes store copy of keys found in leaf nodes.)
29 :
30 : - No extra storage in the node is required for identifying whether or
31 : not an interior node or a leaf.
32 :
33 : - The leaf pair max radix can be independently tuned from the
34 : interior node pair radix (especially useful if sizing interior
35 : nodes and leaves to match things like memory page sizes).
36 :
37 : - Supports fast reverse iteration.
38 :
39 : Typical usage:
40 :
41 : struct mypair {
42 : mykey_t key;
43 :
44 : ... key can be located arbitrarily in struct (and renamed if
45 : ... needed ... see BPLUS_PAIR_KEY). It is managed by the bplus
46 : ... tree and should not be modified).
47 :
48 : ... IMPORTANT SAFETY TIP! The location of a pair can be changed
49 : ... by insert / upsert / remove operations.
50 :
51 : };
52 :
53 : typedef struct mypair mypair_t;
54 :
55 : #define BPLUS_NAME mybplus
56 : #define BPLUS_KEY_T mykey_t
57 : #define BPLUS_PAIR_T mypair_t
58 : #define BPLUS_KEY_CMP(a,b) mykeycmp( (a), (b) )
59 : #include "fd_bplus.c"
60 :
61 : will provide the following APIs as a header only style library in the
62 : compilation unit:
63 :
64 : // A myplus_t is an opaque handle to a join to a bplus tree
65 :
66 : struct mybplus_private;
67 : typedef struct mybplus_private bplus_t;
68 :
69 : // A myplus_iter_t is an opaque handle to a bplus tree iterator
70 :
71 : struct mybplus_private_iter;
72 : typedef struct mybplus_private_iter mybplus_iter_t;
73 :
74 : // Constructors
75 :
76 : // mybplus_{leaf,node}_max_est returns a conservative estimate of
77 : // the number of {leaves,nodes} needed for a worst case bplus tree
78 : // containing ele_max_est elements.
79 :
80 : ulong mybplus_leaf_max_est( ulong ele_max_est );
81 : ulong mybplus_node_max_est( ulong ele_max_est );
82 :
83 : // mybplus_{align,footprint,new,join,leave,delete} have the usual
84 : // persistent IPC object constructors / destructors semantics.
85 :
86 : ulong mybplus_align ( void );
87 : ulong mybplus_footprint( ulong node_max, ulong leaf_max );
88 :
89 : void * mybplus_new ( void * shmem, ulong node_max, ulong leaf_max );
90 : mybplus_t * mybplus_join ( void * shbplus );
91 : void * mybplus_leave ( mybplus_t * join );
92 : void * mybplus_delete( void * shbplus );
93 :
94 : // Accessors
95 :
96 : // mybplus_{node,leaf}_max return the {node,leaf}_max values used
97 : // to construct the bplus tree. Assumes join is a current local
98 : // join. Fast O(1) worst case.
99 :
100 : ulong mybplus_node_max( mybplus_t const * join );
101 : ulong mybplus_leaf_max( mybplus_t const * join );
102 :
103 : // mybplus_is_empty returns 1 if the bplus tree contains no pairs
104 : // and 0 otherwise. Assumes join is a current local join. Fast
105 : // O(1) worst case.
106 :
107 : int mybplus_is_empty( mybplus_t const * join );
108 :
109 : // mybplus_{min,max} return the pointer in the caller's local
110 : // address space to the pair in the bplus tree with the {min,max}
111 : // key. Assumes join is a current local join and bplus tree is not
112 : // empty. The lifetime of the returned pointer is the lesser of
113 : // the lifetime of the local join or the next insert / upsert /
114 : // remove operation on the bplus tree. The bplus tree retains
115 : // ownership of the returned pair and the caller should not modify
116 : // the pair key field. Fast O(1) worst case.
117 : //
118 : // mybplus_{min,max}_const is a const-correct version.
119 :
120 : mypair_t const * mybplus_min_const( mybplus_t const * join );
121 : mypair_t const * mybplus_max_const( mybplus_t const * join );
122 : mypair_t * mybplus_min ( mybplus_t * join );
123 : mypair_t * mybplus_max ( mybplus_t * join );
124 :
125 : // mybplus_query returns the pointer in the caller's local address
126 : // space to the pair in the bplus tree that matches the key pointed
127 : // to by query or NULL if there is no key matching query in the
128 : // bplus tree. Assumes join is a current local join. The lifetime
129 : // of the returned pointer is the lesser of the lifetime of the
130 : // local join or the next insert / upsert / remove operation on the
131 : // bplus tree. The bplus tree retains ownership of the returned
132 : // pair and the caller should not modify the key field. The bplus
133 : // tree has no interest in query in return. Fast O(lg N) worst
134 : // case.
135 : //
136 : // mybplus_query_const is a const-correct version.
137 :
138 : mypair_t const * mybplus_query_const( mybplus_t const * join, mykey_t const * query );
139 : mypair_t * mybplus_query( mybplus_t * join, mykey_t const * query );
140 :
141 : // Operations
142 :
143 : // mybplus_insert inserts a key into the bplus tree. Assumes join
144 : // is a current local join and key points in the caller's address
145 : // space to the key to insert. The bplus tree has no interest in
146 : // key in return.
147 : //
148 : // On success, returns the location in the caller's address space
149 : // where key was inserted. The lifetime of the returned pointer is
150 : // the lesser of the lifetime of the local join or there is an
151 : // insert / upsert / remove operation on the bplus tree. The
152 : // caller should not modify the pair key field but is free to
153 : // modify all the other values.
154 : //
155 : // On failure, returns NULL. Reasons for failure are the key was
156 : // already in the tree (locations of pairs might have changed),
157 : // there were not enough nodes (locations of pairs did not change)
158 : // or there were not enough leaves available to complete the insert
159 : // (locations of pairs did not change).
160 : //
161 : // mybplus_upsert is nearly equivalent to:
162 : //
163 : // int insert;
164 : // mypair_t * pair = mybplus_query ( join, key ); insert = 0;
165 : // if( !pair ) { pair = mybplus_insert( join, key ); insert = 1; }
166 : // if( pair && _opt_insert ) *_opt_insert = insert;
167 : //
168 : // but potentially faster as it only traverses the bplus tree once.
169 : // The "nearly" qualifier is that, unlike the above snippet, the
170 : // upsert might change the location of keys even if key is already
171 : // in the bplus tree. Fast O(lg N) worst case.
172 :
173 : mypair_t * mybplus_insert( mybplus_t * join, mykey_t const * key );
174 : mypair_t * mybplus_upsert( mybplus_t * join, mykey_t const * key, int * _opt_insert );
175 :
176 : // mybplus_remove_key removes a key from the bplus tree. Assumes
177 : // join is a current local join and key points in the caller's
178 : // address space to the key to remove. Returns 0 on success and -1
179 : // if the key was not found in the tree. The bplus tree has no
180 : // interest in key in return. Fast O(lg N) worst case.
181 :
182 : int mybplus_remove_key( mybplus_t * join, mykey_t const * key );
183 :
184 : // mybplus_remove removes the pair pointed to by pair from the
185 : // bplus tree. Assumes join is a current local join and pair is a
186 : // pointer in the caller's local address space to a pair that is
187 : // currently in the bplus tree. The pair is no longer in the bplus
188 : // tree on return. Fast O(lg N) worst case.
189 :
190 : void mybplus_remove( mybplus_t * join, mypair_t * pair );
191 :
192 : // mybplus_flush removes all pairs from the bplus tree. Assumes
193 : // join is a current local join. There are no pairs in the bplus
194 : // tree on return. Fast O( node_max + leaf_max ) worst case.
195 :
196 : void mybplus_flush( mybplus_t * join );
197 :
198 : // mybplus_verify validates the bplus tree pointed by join.
199 : // Returns 0 on success and -1 on failure (logs details).
200 : // O(node_max+leaf_max) worst case.
201 :
202 : int mybplus_verify( mybplus_t const * join );
203 :
204 : // Iteration
205 :
206 : // mybplus_iter_nul returns an iterator positioned at nul. Fast
207 : // O(1) worst case.
208 : //
209 : // mybplus_iter_min returns an iterator positioned at the min pair
210 : // or nul if the bplus is empty. Fast O(1) worst case.
211 : //
212 : // mybplus_iter_max returns an iterator positioned at the max pair
213 : // or nul if the bplus is empty. Fast O(1) worst case.
214 : //
215 : // mybplus_iter_ge returns an iterator positioned at the first pair
216 : // greater than or equal to query or at nul if all keys are less
217 : // than query. query==NULL is equivalent to "+inf". Fast O(lg N)
218 : // worst case.
219 : //
220 : // mybplus_iter_gt returns an iterator positioned at the first pair
221 : // greater than query or at nul if all keys are less than or equal
222 : // to query. query==NULL is equivalent to "+inf". Fast O(lg N)
223 : // worst case.
224 : //
225 : // mybplus_iter_le returns an iterator positioned at the last pair
226 : // less than or equal to query or at nul if all keys are greater
227 : // than query. query==NULL is equivalent to "-inf". Fast O(lg N)
228 : // worst case.
229 : //
230 : // mybplus_iter_lt returns an iterator positioned at the last pair
231 : // less than to query or at nul if all keys are greater than or
232 : // equal to query. query==NULL is equivalent to "-inf". Fast
233 : // O(lg N) worst case.
234 : //
235 : // mybplus_iter_next returns an iterator positioned at the next
236 : // pair or nul if the iterator is currently positioned at last
237 : // pair. Fast O(1) worst case.
238 : //
239 : // mybplus_iter_prev returns an iterator positioned at the previous
240 : // pair or nul if the iterator is currently positioned at first
241 : // pair. Fast O(1) worst case.
242 : //
243 : // mybplus_iter_eq returns true if iter is positioned at the same
244 : // place fini is positioned. Fast O(1) worst case.
245 : //
246 : // mybplus_iter_pair returns a pair associated with the current
247 : // iteration position. mybplus_iter_pair_const is a const correct
248 : // version. Fast O(1) worst case.
249 : //
250 : // Assumes join is a current local join and query points to a valid
251 : // key in the caller's local address space. Retains no interest in
252 : // query on return.
253 : //
254 : // Example: iterate over all pairs in ascending order:
255 : //
256 : // for( mybplus_iter_t iter = mybplus_iter_min( bplus );
257 : // !mybplus_iter_eq_nul( bplus, iter );
258 : // iter = mybplus_iter_next( bplus, iter ) ) {
259 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
260 : // ... process pair here
261 : // ... do not insert, upsert remove keys from bplus here
262 : // ... do not modify key of pair here
263 : // }
264 : //
265 : // Example: iterate over all pairs in descending order:
266 : //
267 : // for( mybplus_iter_t iter = mybplus_iter_max( bplus );
268 : // !mybplus_iter_eq_nul( bplus, iter );
269 : // iter = mybplus_iter_prev( bplus, iter ) ) {
270 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
271 : // ... process pair here
272 : // ... do not insert, upsert remove keys from bplus here
273 : // ... do not modify key of pair here
274 : // }
275 : //
276 : // Example: iterate over all pairs with keys in [key0,key1) in
277 : // ascending order (assumes key1>=key0):
278 : //
279 : // mybplus_iter_t iter = mybplus_iter_ge( bplus, key0 );
280 : // mybplus_iter_t fini = mybplus_iter_ge( bplus, key1 ); // key1==NULL will iterate over all pairs with keys >= key0
281 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
282 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
283 : //
284 : // ... process pair here
285 : // ... do not insert, upsert or remove keys from bplus here
286 : // ... do not modify key of pair here
287 : //
288 : // iter = mybplus_iter_next( bplus, iter );
289 : // }
290 : //
291 : // Example: iterate over all pairs with keys in [key0,key1] in
292 : // ascending order (assumes key1>=key0):
293 : //
294 : // mybplus_iter_t iter = mybplus_iter_ge( bplus, key0 );
295 : // mybplus_iter_t fini = mybplus_iter_gt( bplus, key1 ); // key1==NULL will iterate over all pairs with keys >= key0
296 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
297 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
298 : //
299 : // ... process pair here
300 : // ... do not insert, upsert or remove keys from bplus here
301 : // ... do not modify key of pair here
302 : //
303 : // iter = mybplus_iter_next( bplus, iter );
304 : // }
305 : //
306 : // Example: iterate over all pairs with keys in (key0,key1) in
307 : // ascending order (assumes key1>=key0):
308 : //
309 : // mybplus_iter_t iter = mybplus_iter_gt( bplus, key0 );
310 : // mybplus_iter_t fini = mybplus_iter_ge( bplus, key1 ); // key1==NULL will iterate over all pairs with keys > key0
311 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
312 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
313 : //
314 : // ... process pair here
315 : // ... do not insert, upsert or remove keys from bplus here
316 : // ... do not modify key of pair here
317 : //
318 : // iter = mybplus_iter_next( bplus, iter );
319 : // }
320 : //
321 : // Example: iterate over all pairs with keys in (key0,key1] in
322 : // ascending order (assumes key1>=key0):
323 : //
324 : // mybplus_iter_t iter = mybplus_iter_gt( bplus, key0 );
325 : // mybplus_iter_t fini = mybplus_iter_gt( bplus, key1 ); // key1==NULL will iterate over all pairs with keys > key0
326 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
327 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
328 : //
329 : // ... process pair here
330 : // ... do not insert, upsert or remove keys from bplus here
331 : // ... do not modify key of pair here
332 : //
333 : // iter = mybplus_iter_next( bplus, iter );
334 : // }
335 : //
336 : // Example: iterate over all pairs with keys in [key0,key1) in
337 : // descending order (assumes key1>=key0):
338 : //
339 : // mybplus_iter_t iter = mybplus_iter_lt( bplus, key1 );
340 : // mybplus_iter_t fini = mybplus_iter_lt( bplus, key0 ); // key0==NULL will iterate over all pairs with keys < key1
341 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
342 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
343 : //
344 : // ... process pair here
345 : // ... do not insert, upsert or remove keys from bplus here
346 : // ... do not modify key of pair here
347 : //
348 : // iter = mybplus_iter_prev( bplus, iter );
349 : // }
350 : //
351 : // Example: iterate over all pairs with keys in [key0,key1] in
352 : // descending order (assumes key1>=key0):
353 : //
354 : // mybplus_iter_t iter = mybplus_iter_le( bplus, key1 );
355 : // mybplus_iter_t fini = mybplus_iter_lt( bplus, key0 ); // key0==NULL will iterate over all pairs with keys <= key1
356 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
357 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
358 : //
359 : // ... process pair here
360 : // ... do not insert, upsert or remove keys from bplus here
361 : // ... do not modify key of pair here
362 : //
363 : // iter = mybplus_iter_prev( bplus, iter );
364 : // }
365 : //
366 : // Example: iterate over all pairs with keys in (key0,key1) in
367 : // descending order (assumes key1>=key0):
368 : //
369 : // mybplus_iter_t iter = mybplus_iter_lt( bplus, key1 );
370 : // mybplus_iter_t fini = mybplus_iter_le( bplus, key0 ); // key0==NULL will iterate over all pairs with keys < key1
371 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
372 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
373 : //
374 : // ... process pair here
375 : // ... do not insert, upsert or remove keys from bplus here
376 : // ... do not modify key of pair here
377 : //
378 : // iter = mybplus_iter_prev( bplus, iter );
379 : // }
380 : //
381 : // Example: iterate over all pairs with keys in (key0,key1] in
382 : // descending order (assumes key1>=key0):
383 : //
384 : // mybplus_iter_t iter = mybplus_iter_le( bplus, key1 );
385 : // mybplus_iter_t fini = mybplus_iter_le( bplus, key0 ); // key0==NULL will iterate over all pairs with keys < key1
386 : // while( !mybplus_iter_eq( bplus, iter, fini ) ) {
387 : // mypair_t * pair = mybplus_iter_pair( bplus, iter );
388 : //
389 : // ... process pair here
390 : // ... do not insert, upsert or remove keys from bplus here
391 : // ... do not modify key of pair here
392 : //
393 : // iter = mybplus_iter_prev( bplus, iter );
394 : // }
395 :
396 : mybplus_iter_t mybplus_iter_nul( mybplus_t const * join );
397 : mybplus_iter_t mybplus_iter_min( mybplus_t const * join );
398 : mybplus_iter_t mybplus_iter_max( mybplus_t const * join );
399 :
400 : mybplus_iter_t mybplus_iter_ge( mybplus_t const * join, mykey_t const * query );
401 : mybplus_iter_t mybplus_iter_gt( mybplus_t const * join, mykey_t const * query );
402 : mybplus_iter_t mybplus_iter_le( mybplus_t const * join, mykey_t const * query );
403 : mybplus_iter_t mybplus_iter_lt( mybplus_t const * join, mykey_t const * query );
404 :
405 : int mybplus_iter_eq ( mybplus_t const * join, mybplus_iter_t i0, mybplus_iter_t i1 );
406 : int mybplus_iter_eq_nul( mybplus_t const * join, mybplus_iter_t iter );
407 :
408 : mybplus_iter_t mybplus_iter_next( mybplus_t const * join, mybplus_iter_t iter );
409 : mybplus_iter_t mybplus_iter_prev( mybplus_t const * join, mybplus_iter_t iter );
410 :
411 : mypair_t const * mybplus_iter_pair_const( mybplus_t const * join, mybplus_iter_t iter );
412 : mypair_t * mybplus_iter_pair ( mybplus_t * join, mybplus_iter_t iter );
413 :
414 : You can do this as often as you like in a compilation unit to get
415 : different types of bplus trees. Variants exist for making header
416 : protoypes only and/or implementations if doing a library with
417 : multiple compilation units. Further, options exist to use different
418 : hashing functions, comparison functions, etc as detailed below. */
419 :
420 : /* BPLUS_NAME gives the API prefix. */
421 :
422 : #ifndef BPLUS_NAME
423 : #error "Define BPLUS_NAME"
424 : #endif
425 :
426 : /* BPLUS_KEY_T gives the key type. Should be a plain-old-data type with
427 : a total order. */
428 :
429 : #ifndef BPLUS_KEY_T
430 : #error "Define BPLUS_KEY_T"
431 : #endif
432 :
433 : /* BPLUS_PAIR_T gives the pair type. Should be a structure of the form:
434 :
435 : typedef struct BPLUS_PAIR {
436 : BPLUS_KEY_T key; // Can be arbitrarily placed in structure, should not be modified by the user
437 : ... arbitrary user fields
438 : } BPLUS_PAIR_T;
439 :
440 : (Or the appropriate field name given BPLUS_PAIR_KEY below.) */
441 :
442 : #ifndef BPLUS_PAIR_T
443 : #error "Define BPLUS_PAIR_T"
444 : #endif
445 :
446 : /* BPLUS_PAIR_KEY gives the name of the key field in the BPLUS_PAIR_KEY.
447 : Defaults to key. */
448 :
449 : #ifndef BPLUS_PAIR_KEY
450 : #define BPLUS_PAIR_KEY key
451 : #endif
452 :
453 : /* BPLUS_KEY_CMP compares the keys pointed to by a and b and returns
454 : {<0,0,>0} if the a is {less than,equal to,greater than}. a and b
455 : will be valid pointers to key . Defaults to memcmp based. */
456 :
457 : #ifndef BPLUS_KEY_CMP
458 : #define BPLUS_KEY_CMP(a,b) memcmp( (a), (b), sizeof(*(a)) )
459 : #endif
460 :
461 : /* BPLUS_TREE_MAX is the maximum number of children a non-leaf node can
462 : have. Must be even, >=4 and <<< ULONG_MAX. Defaults to 128. */
463 :
464 : #ifndef BPLUS_TREE_MAX
465 : #define BPLUS_TREE_MAX 128
466 : #endif
467 :
468 : /* BPLUS_PAIR_MAX is the maximum number of children a leaf node can
469 : have. Must be even, >=4 and <<< ULONG_MAX. Defaults to 128. */
470 :
471 : #ifndef BPLUS_PAIR_MAX
472 : #define BPLUS_PAIR_MAX 128
473 : #endif
474 :
475 : /* BPLUS_ALIGN gives the default alignment of the BPLUS region. Should
476 : be a positive integer power of 2. Defaults to 128. */
477 :
478 : #ifndef BPLUS_ALIGN
479 9 : #define BPLUS_ALIGN 128
480 : #endif
481 :
482 : /* BPLUS_NODE_ALIGN gives the default alignment of an interior node.
483 : Should be a positive integer power of 2 of at most BPLUS_ALIGN.
484 : Defaults to 128. */
485 :
486 : #ifndef BPLUS_NODE_ALIGN
487 587278872 : #define BPLUS_NODE_ALIGN 128
488 : #endif
489 :
490 : /* BPLUS_LEAF_ALIGN gives the default alignment of a leaf node. Should
491 : be a positive integer power of 2 of at most BPLUS_ALIGN. Defaults to
492 : 128. */
493 :
494 : #ifndef BPLUS_LEAF_ALIGN
495 587278872 : #define BPLUS_LEAF_ALIGN 128
496 : #endif
497 :
498 : /* BPLUS_MAGIC is the structure magic number to use to aid in persistent
499 : and or IPC usage. */
500 :
501 : #ifndef BPLUS_MAGIC
502 3 : #define BPLUS_MAGIC (0xfdb91c53a61c0000UL) /* FD BPLUS MAGIC 0000 */
503 : #endif
504 :
505 : /* BPLUS_IMPL_STYLE indicates what this generator should output:
506 : 0 - static implementation
507 : 1 - library header
508 : 2 - library implementation */
509 :
510 : #ifndef BPLUS_IMPL_STYLE
511 : #define BPLUS_IMPL_STLYE 0
512 : #endif
513 :
514 : /**********************************************************************/
515 :
516 20959878411 : #define BPLUS_(name)FD_EXPAND_THEN_CONCAT3(BPLUS_NAME,_,name)
517 :
518 : #if BPLUS_IMPL_STYLE==0
519 : #define BPLUS_STATIC FD_FN_UNUSED static
520 : #else
521 : #define BPLUS_STATIC
522 : #endif
523 :
524 : #if BPLUS_IMPL_STYLE==0 || BPLUS_IMPL_STYLE==1
525 :
526 : /* Header *************************************************************/
527 :
528 : #include "../log/fd_log.h"
529 :
530 : struct BPLUS_(private);
531 : typedef struct BPLUS_(private) BPLUS_(t);
532 :
533 : struct BPLUS_(private_iter);
534 : typedef struct BPLUS_(private_iter) BPLUS_(iter_t);
535 :
536 : /* Internal use only */
537 :
538 : /* A bplus_private_node_t is used for finding leaves that might contain
539 : an element fast. */
540 :
541 : struct __attribute__((aligned(BPLUS_NODE_ALIGN))) BPLUS_(private_node) {
542 :
543 : /* This point is BPLUS_NODE_ALIGN aligned */
544 :
545 : ulong tree_cnt; /* if acquired, in [0,BPLUS_TREE_MAX], else ignored */
546 : ulong tree_off[ BPLUS_TREE_MAX ]; /* if acquired, indexed [0,tree_cnt), else
547 : tree_off[0]==node pool next offset (0 if last node in pool) */
548 : BPLUS_KEY_T pivot [ BPLUS_TREE_MAX-1 ]; /* if acquired, indexed [0,tree_cnt-1), else ignored */
549 :
550 : /* tree i handles keys in [ pivot[i-1], pivot[i] ), pivot[-1] /
551 : pivot[tree_cnt-1] are implied to be the previous / next pivot in an
552 : in-order traversal of the bplus tree node pivots (or -/+inf if
553 : leftmost/rightmost). */
554 : };
555 :
556 : typedef struct BPLUS_(private_node) BPLUS_(private_node_t);
557 :
558 : /* A bplus_private_leaf_t holds up to pair_cnt elements of pairs in the
559 : tree in a sorted order. */
560 :
561 : struct __attribute__((aligned(BPLUS_LEAF_ALIGN))) BPLUS_(private_leaf) {
562 :
563 : /* This point is BPLUS_LEAF_ALIGN aligned */
564 :
565 : ulong pair_cnt; /* if acquired, in [0,BPLUS_PAIR_MAX], else ignored */
566 : ulong prev_off; /* if acquired, prev leaf offset (or 0 if first leaf), else ignored */
567 : ulong next_off; /* if acquired, next leaf offset (or 0 if last leaf),
568 : else leaf pool next offset (0 if last node in pool) */
569 : BPLUS_PAIR_T pair[ BPLUS_PAIR_MAX ]; /* if acquired, indexed [0,pair_cnt), unique keys and in ascending order, else ignored */
570 : };
571 :
572 : typedef struct BPLUS_(private_leaf) BPLUS_(private_leaf_t);
573 :
574 : /* A bplus_private_t is a continguous region of memory that holds a
575 : bplus tree. Important invariants:
576 :
577 : - Empty trees have no root.
578 : - If root is a leaf, it has [1,pair_max] pairs.
579 : - If root is a node, it has [2,tree_max] trees.
580 : - Non-root nodes have [tree_min,tree_max] trees.
581 : - Non-root leaves have [pair_min,pair_max] pairs.
582 : - Children of a node are not a mix of nodes and leaves. */
583 :
584 : struct __attribute__((aligned(BPLUS_ALIGN))) BPLUS_(private) {
585 :
586 : /* This point is aligned BPLUS_ALIGN */
587 :
588 : ulong magic; /* ==BPLUS_MAGIC */
589 : ulong node_max; ulong leaf_max; /* maximum number of node/leaf in the store */
590 : ulong node_lo; ulong leaf_lo; /* offset from the first byte of bplus header to the node/leaf storage */
591 : ulong node_pool_off; ulong leaf_pool_off; /* first node/leaf in node/leaf pool, 0 if no node/leaf in pool */
592 : ulong root_off; /* offset of node/leaf to tree root (or 0 if empty) */
593 : ulong leaf_min_off; /* offset of leaf with minimum pair (or 0 if empty) */
594 : ulong leaf_max_off; /* offset of leaf with maximum pair (or 0 if empty) */
595 :
596 : /* padding to BPLUS_NODE_ALIGN here */
597 : /* node_lo points here, node_max elements, indexed [0,node_max) */
598 : /* padding to BPLUS_LEAF_ALIGN here */
599 : /* leaf_lo points here, leaf_max elements, indexed [0,leaf_max) */
600 : /* padding to BPLUS_ALIGN here */
601 :
602 : };
603 :
604 : typedef struct BPLUS_(private) BPLUS_(private_t);
605 :
606 : struct BPLUS_(private_iter) {
607 : ulong leaf_off; /* offset to current leaf */
608 : ulong pair_idx; /* current pair in current leaf */
609 : };
610 :
611 : FD_PROTOTYPES_BEGIN
612 :
613 : /* bplus_private_{pair,tree}_{min,max} return the corresponding
614 : configuration values for this bplus implementation. */
615 :
616 0 : FD_FN_CONST static inline ulong BPLUS_(private_pair_min)( void ) { return (ulong)(BPLUS_PAIR_MAX/2); } /* exact */
617 0 : FD_FN_CONST static inline ulong BPLUS_(private_pair_max)( void ) { return (ulong) BPLUS_PAIR_MAX; }
618 :
619 0 : FD_FN_CONST static inline ulong BPLUS_(private_tree_min)( void ) { return (ulong)(BPLUS_TREE_MAX/2); } /* exact */
620 0 : FD_FN_CONST static inline ulong BPLUS_(private_tree_max)( void ) { return (ulong) BPLUS_TREE_MAX; }
621 :
622 : /* bplus_private_{node,leaf}_max_max return a value for {node,leaf}_max
623 : such that the {node,leaf} storage of the bplus tree will require at
624 : most 2^62 bytes. */
625 :
626 : FD_FN_CONST static inline ulong
627 0 : BPLUS_(private_node_max_max)( void ) {
628 0 : return ((1UL<<62)-BPLUS_NODE_ALIGN+1UL) / sizeof( BPLUS_(private_node_t));
629 0 : }
630 :
631 : FD_FN_CONST static inline ulong
632 0 : BPLUS_(private_leaf_max_max)( void ) {
633 0 : return ((1UL<<62)-BPLUS_LEAF_ALIGN+1UL) / sizeof( BPLUS_(private_leaf_t));
634 0 : }
635 :
636 : /* bplus_private_key_cmp gives BPLUS_KEY_CMP the exact function
637 : signature used by the below implementations. */
638 :
639 : FD_FN_PURE static inline int
640 : BPLUS_(private_key_cmp)( BPLUS_KEY_T const * a,
641 9880452882 : BPLUS_KEY_T const * b ) {
642 9880452882 : return BPLUS_KEY_CMP(a,b);
643 9880452882 : }
644 :
645 : /* bplus_private_is_leaf returns 1 if the root of the tree at bplus
646 : global offset is a leaf or 0 if it is a node. leaf_lo is the bplus
647 : global offset of the leaf preallocated storage. Assumes tree_off and
648 : leaf_lo are valid. */
649 :
650 4640923854 : FD_FN_CONST static inline int BPLUS_(private_is_leaf)( ulong tree_off, ulong leaf_lo ) { return tree_off>=leaf_lo; }
651 :
652 : /* bplus_private returns location of the bplus private metadata in the
653 : caller's address space given a valid local join. Lifetime of the
654 : returned pointer is the lifetime of the join. bplus_private_const is
655 : a const correct version. */
656 :
657 : FD_FN_CONST static inline BPLUS_(private_t) *
658 16917414 : BPLUS_(private)( BPLUS_(t) * join ) {
659 16917414 : return (BPLUS_(private_t) *)join;
660 16917414 : }
661 :
662 : FD_FN_CONST static inline BPLUS_(private_t) const *
663 80622819 : BPLUS_(private_const)( BPLUS_(t) const * join ) {
664 80622819 : return (BPLUS_(private_t) const *)join;
665 80622819 : }
666 :
667 : /* bplus_private_{node,leaf} return the pointer in the caller's local
668 : address space of the {node,leaf} located at bplus global
669 : {node,leaf}_off. The lifetime of the returned pointer is the
670 : lifetime of the local join. Assumes bplus and node_off are valid. */
671 :
672 : FD_FN_CONST static inline BPLUS_(private_node_t) *
673 : BPLUS_(private_node)( BPLUS_(private_t) * bplus,
674 71810601 : ulong node_off ) {
675 71810601 : return (BPLUS_(private_node_t) *)((ulong)bplus + node_off);
676 71810601 : }
677 :
678 : FD_FN_CONST static inline BPLUS_(private_leaf_t) *
679 : BPLUS_(private_leaf)( BPLUS_(private_t) * bplus,
680 23130888 : ulong leaf_off ) {
681 23130888 : return (BPLUS_(private_leaf_t) *)((ulong)bplus + leaf_off);
682 23130888 : }
683 :
684 : FD_FN_CONST static inline BPLUS_(private_node_t) const *
685 : BPLUS_(private_node_const)( BPLUS_(private_t) const * bplus,
686 4264239753 : ulong node_off ) {
687 4264239753 : return (BPLUS_(private_node_t) const *)((ulong)bplus + node_off);
688 4264239753 : }
689 :
690 : FD_FN_CONST static inline BPLUS_(private_leaf_t) const *
691 : BPLUS_(private_leaf_const)( BPLUS_(private_t) const * bplus,
692 7090158669 : ulong leaf_off ) {
693 7090158669 : return (BPLUS_(private_leaf_t) const *)((ulong)bplus + leaf_off);
694 7090158669 : }
695 :
696 : /* bplus_private_off returns the bplus global offset for the given
697 : address in the caller's address space. Assumes bplus is valid and
698 : addr is non-NULL and into the bplus memory region. */
699 :
700 : FD_FN_CONST static inline ulong
701 : BPLUS_(private_off)( BPLUS_(private_t) const * bplus,
702 3380268 : void const * addr ) {
703 3380268 : return (ulong)addr - (ulong)bplus;
704 3380268 : }
705 :
706 : /* bplus_private_node_acquire acquires a node from the bplus's node pool
707 : and returns a pointer to it in the caller's address space. Assumes
708 : bplus is valid. Returns NULL if bplus node pool is empty.
709 :
710 : bplus_private_node_release releases a node to the bplus's node pool.
711 : Assumes bplus is valid, node is valid and node is not currently in
712 : the pool.
713 :
714 : Similarly for bplus_private_leaf_{acquire,release}. */
715 :
716 : static inline BPLUS_(private_node_t) *
717 232035 : BPLUS_(private_node_acquire)( BPLUS_(private_t) * bplus ) {
718 232035 : ulong node_off = bplus->node_pool_off;
719 232035 : if( FD_UNLIKELY( !node_off ) ) return NULL;
720 232035 : BPLUS_(private_node_t *) node = BPLUS_(private_node)( bplus, node_off );
721 232035 : bplus->node_pool_off = node->tree_off[0];
722 232035 : return node;
723 232035 : }
724 :
725 : static inline void
726 : BPLUS_(private_node_release)( BPLUS_(private_t) * bplus,
727 236961 : BPLUS_(private_node_t) * node ) {
728 236961 : node->tree_off[0] = bplus->node_pool_off;
729 236961 : bplus->node_pool_off = BPLUS_(private_off)( bplus, node );
730 236961 : }
731 :
732 : static inline BPLUS_(private_leaf_t) *
733 967224 : BPLUS_(private_leaf_acquire)( BPLUS_(private_t) * bplus ) {
734 967224 : ulong leaf_off = bplus->leaf_pool_off;
735 967224 : if( FD_UNLIKELY( !leaf_off ) ) return NULL;
736 967221 : BPLUS_(private_leaf_t *) leaf = BPLUS_(private_leaf)( bplus, leaf_off );
737 967221 : bplus->leaf_pool_off = leaf->next_off;
738 967221 : return leaf;
739 967224 : }
740 :
741 : static inline void
742 : BPLUS_(private_leaf_release)( BPLUS_(private_t) * bplus,
743 976146 : BPLUS_(private_leaf_t) * leaf ) {
744 976146 : leaf->next_off = bplus->leaf_pool_off;
745 976146 : bplus->leaf_pool_off = BPLUS_(private_off)( bplus, leaf );
746 976146 : }
747 :
748 : /* bplus_private_insert inserts or upserts a key into a bplus tree.
749 : Assumes join is a current local join and key points to a valid key in
750 : the caller's address space and upsert is in [0,1].
751 :
752 : upsert 0: key will inserted into bplus. On success, returns the pair
753 : where key was inserted and, on return, *_insert will be 1. Caller
754 : can update all fields in the pair except the key. Lifetime of the
755 : returned pointer is until the next insert / upsert / remove. Returns
756 : NULL if there was no room in the bplus tree or if key was already in
757 : the bplus tree (might have moved pairs around in bplus tree on
758 : failure) and _insert will be untouched.
759 :
760 : upsert 1: key will inserted or updated into bplus. If key is already
761 : present in the bplus tree, returns the location in the caller's
762 : address space of the pair with the matching key and, on return,
763 : *_insert will be 0. If not, inserts the key and requires the
764 : location in the caller's address space where pair was inserted and,
765 : on return, *_insert will be 1. In both cases, the lifetime of the
766 : returned pointer is until the next insert / upsert / remove. Returns
767 : NULL if there was no room in the bplus tree to insert (might have
768 : moved pairs around in bplus tree on failure) and _insert will be
769 : untouched.
770 :
771 : The bplus retains no interest in query on return. */
772 :
773 : BPLUS_STATIC BPLUS_PAIR_T *
774 : BPLUS_(private_insert)( BPLUS_(t) * join,
775 : BPLUS_KEY_T const * key,
776 : int upsert,
777 : int * _insert );
778 :
779 : /* bplus_private_iter returns the iterator corresponding to query and
780 : op. Assumes join is a current local join, query points to a valid
781 : key in the caller's address space or is NULL and op is in [0,3].
782 : Returns an iter positioned at:
783 :
784 : op | position
785 : -------+-------------------------------------------------------------------------------------------------------------------
786 : 0 (GE) | the first pair with a key greater than or equal to query (or nul if all have keys less than query)
787 : 1 (GT) | the first pair with a key greater than query (or nul if all have keys less than or equal to query)
788 : 2 (LE) | the last pair with a key less than or equal to query (or nul if all have keys greater than query)
789 : 3 (LT) | the last pair with a key less than query (or nul if all have keys greater than or equal to query)
790 :
791 : If query is NULL, iteration will be positioned as though:
792 :
793 : op | query
794 : -------+-------
795 : 0 (GE) | +inf
796 : 1 (GT) | +inf
797 : 2 (LE) | -inf
798 : 3 (LT) | -inf
799 :
800 : The bplus retains no interest in query on return. */
801 :
802 : FD_FN_PURE BPLUS_STATIC BPLUS_(iter_t)
803 : BPLUS_(private_iter)( BPLUS_(t) const * join,
804 : BPLUS_KEY_T const * query,
805 : int op );
806 :
807 : FD_PROTOTYPES_END
808 :
809 : /* End internal use only */
810 :
811 : FD_PROTOTYPES_BEGIN
812 :
813 : /* Constructors */
814 :
815 : FD_FN_CONST BPLUS_STATIC ulong BPLUS_(leaf_max_est)( ulong ele_max_est );
816 : FD_FN_CONST BPLUS_STATIC ulong BPLUS_(node_max_est)( ulong ele_max_est );
817 :
818 : FD_FN_CONST BPLUS_STATIC ulong BPLUS_(align) ( void );
819 : FD_FN_CONST BPLUS_STATIC ulong BPLUS_(footprint)( ulong node_max, ulong leaf_max );
820 :
821 : BPLUS_STATIC void * BPLUS_(new) ( void * shmem, ulong node_max, ulong leaf_max );
822 : BPLUS_STATIC BPLUS_(t) * BPLUS_(join) ( void * shbplus );
823 : BPLUS_STATIC void * BPLUS_(leave) ( BPLUS_(t) * join );
824 : BPLUS_STATIC void * BPLUS_(delete)( void * shbplus );
825 :
826 : /* Accessors */
827 :
828 1873254 : FD_FN_PURE static inline ulong BPLUS_(node_max)( BPLUS_(t) const * join ) { return BPLUS_(private_const)( join )->node_max; }
829 1873254 : FD_FN_PURE static inline ulong BPLUS_(leaf_max)( BPLUS_(t) const * join ) { return BPLUS_(private_const)( join )->leaf_max; }
830 :
831 16869642 : FD_FN_PURE static inline int BPLUS_(is_empty)( BPLUS_(t) const * join ) { return !BPLUS_(private_const)( join )->root_off; }
832 :
833 : FD_FN_PURE static inline BPLUS_PAIR_T const *
834 7497702 : BPLUS_(min_const)( BPLUS_(t) const * join ) {
835 7497702 : BPLUS_(t) const * bplus = BPLUS_(private_const)( join );
836 7497702 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, bplus->leaf_min_off );
837 7497702 : return &leaf->pair[0];
838 7497702 : }
839 :
840 : FD_FN_PURE static inline BPLUS_PAIR_T const *
841 7497702 : BPLUS_(max_const)( BPLUS_(t) const * join ) {
842 7497702 : BPLUS_(t) const * bplus = BPLUS_(private_const)( join );
843 7497702 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, bplus->leaf_max_off );
844 7497702 : return &leaf->pair[ leaf->pair_cnt-1UL ];
845 7497702 : }
846 :
847 3748851 : FD_FN_PURE static inline BPLUS_PAIR_T * BPLUS_(min)( BPLUS_(t) * join ) { return (BPLUS_PAIR_T *)BPLUS_(min_const)( join ); }
848 3748851 : FD_FN_PURE static inline BPLUS_PAIR_T * BPLUS_(max)( BPLUS_(t) * join ) { return (BPLUS_PAIR_T *)BPLUS_(max_const)( join ); }
849 :
850 : FD_FN_PURE BPLUS_STATIC BPLUS_PAIR_T const * BPLUS_(query_const)( BPLUS_(t) const * join, BPLUS_KEY_T const * query );
851 :
852 : FD_FN_PURE static inline BPLUS_PAIR_T *
853 : BPLUS_(query)( BPLUS_(t) * join,
854 13124622 : BPLUS_KEY_T const * query ) {
855 13124622 : return (BPLUS_PAIR_T *)BPLUS_(query_const)( join, query );
856 13124622 : }
857 :
858 : /* Operations */
859 :
860 : static inline BPLUS_PAIR_T *
861 : BPLUS_(insert)( BPLUS_(t) * join,
862 3773271 : BPLUS_KEY_T const * key ) {
863 3773271 : int dummy;
864 3773271 : return BPLUS_(private_insert)( join, key, 0, &dummy );
865 3773271 : }
866 :
867 : static inline BPLUS_PAIR_T *
868 : BPLUS_(upsert)( BPLUS_(t) * join,
869 : BPLUS_KEY_T const * key,
870 3751845 : int * _opt_insert ) {
871 3751845 : int dummy;
872 3751845 : if( !_opt_insert ) _opt_insert = &dummy; /* compile time */
873 3751845 : return BPLUS_(private_insert)( join, key, 1, _opt_insert );
874 3751845 : }
875 :
876 : BPLUS_STATIC int BPLUS_(remove_key)( BPLUS_(t) * join, BPLUS_KEY_T const * key );
877 :
878 1873158 : static inline void BPLUS_(remove)( BPLUS_(t) * join, BPLUS_PAIR_T * pair ) { BPLUS_(remove_key)( join, &pair->BPLUS_PAIR_KEY ); }
879 :
880 : BPLUS_STATIC void BPLUS_(flush)( BPLUS_(t) * join );
881 :
882 : FD_FN_PURE BPLUS_STATIC int BPLUS_(verify)( BPLUS_(t) const * join );
883 :
884 : /* Iteration */
885 : /* FIXME: FD_FN_CONST for nul/eq/eq_nul/pair/pair_const? FD_FN_PURE for
886 : min/max/prev/next? */
887 :
888 : static inline BPLUS_(iter_t)
889 1875756 : BPLUS_(iter_nul)( BPLUS_(t) const * join ) {
890 1875756 : (void)join;
891 1875756 : BPLUS_(iter_t) iter;
892 1875756 : iter.leaf_off = 0UL;
893 1875756 : iter.pair_idx = 0UL;
894 1875756 : return iter;
895 1875756 : }
896 :
897 : static inline BPLUS_(iter_t)
898 1875756 : BPLUS_(iter_min)( BPLUS_(t) const * join ) {
899 1875756 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
900 1875756 : ulong leaf_off = bplus->leaf_min_off;
901 1875756 : BPLUS_(iter_t) iter;
902 1875756 : iter.leaf_off = leaf_off;
903 1875756 : iter.pair_idx = 0UL;
904 1875756 : return iter;
905 1875756 : }
906 :
907 : static inline BPLUS_(iter_t)
908 1875756 : BPLUS_(iter_max)( BPLUS_(t) const * join ) {
909 1875756 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
910 1875756 : ulong leaf_off = bplus->leaf_max_off;
911 1875756 : BPLUS_(iter_t) iter;
912 1875756 : iter.leaf_off = leaf_off;
913 1875756 : iter.pair_idx = (FD_UNLIKELY( !leaf_off ) ? 1UL : BPLUS_(private_leaf_const)( bplus, leaf_off )->pair_cnt) - 1UL;
914 1875756 : return iter;
915 1875756 : }
916 :
917 : FD_FN_PURE static inline BPLUS_(iter_t)
918 : BPLUS_(iter_ge)( BPLUS_(t) const * join,
919 3749115 : BPLUS_KEY_T const * query ) {
920 3749115 : return BPLUS_(private_iter)( join, query, 0 );
921 3749115 : }
922 :
923 : FD_FN_PURE static inline BPLUS_(iter_t)
924 : BPLUS_(iter_gt)( BPLUS_(t) const * join,
925 3749115 : BPLUS_KEY_T const * query ) {
926 3749115 : return BPLUS_(private_iter)( join, query, 1 );
927 3749115 : }
928 :
929 : FD_FN_PURE static inline BPLUS_(iter_t)
930 : BPLUS_(iter_le)( BPLUS_(t) const * join,
931 3749115 : BPLUS_KEY_T const * query ) {
932 3749115 : return BPLUS_(private_iter)( join, query, 2 );
933 3749115 : }
934 :
935 : FD_FN_PURE static inline BPLUS_(iter_t)
936 : BPLUS_(iter_lt)( BPLUS_(t) const * join,
937 3749115 : BPLUS_KEY_T const * query ) {
938 3749115 : return BPLUS_(private_iter)( join, query, 3 );
939 3749115 : }
940 :
941 : static inline int
942 : BPLUS_(iter_eq)( BPLUS_(t) const * join,
943 : BPLUS_(iter_t) iter,
944 11251554 : BPLUS_(iter_t) fini ) {
945 11251554 : (void)join;
946 11251554 : return (iter.leaf_off==fini.leaf_off) & (iter.pair_idx==fini.pair_idx);
947 11251554 : }
948 :
949 : static inline int
950 : BPLUS_(iter_eq_nul)( BPLUS_(t) const * join,
951 14998770 : BPLUS_(iter_t) iter ) {
952 14998770 : (void)join;
953 14998770 : return !iter.leaf_off;
954 14998770 : }
955 :
956 : static inline BPLUS_(iter_t)
957 : BPLUS_(iter_next)( BPLUS_(t) const * join,
958 3751176 : BPLUS_(iter_t) iter ) {
959 3751176 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
960 :
961 3751176 : ulong leaf_off = iter.leaf_off;
962 3751176 : ulong pair_idx = iter.pair_idx;
963 :
964 3751176 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, leaf_off );
965 :
966 3751176 : pair_idx++;
967 3751176 : if( FD_UNLIKELY( pair_idx>=leaf->pair_cnt ) ) { /* optimize for high radix */
968 3751176 : leaf_off = leaf->next_off;
969 3751176 : pair_idx = 0UL;
970 3751176 : }
971 :
972 3751176 : iter.leaf_off = leaf_off;
973 3751176 : iter.pair_idx = pair_idx;
974 3751176 : return iter;
975 3751176 : }
976 :
977 : static inline BPLUS_(iter_t)
978 : BPLUS_(iter_prev)( BPLUS_(t) const * join,
979 1875681 : BPLUS_(iter_t) iter ) {
980 1875681 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
981 :
982 1875681 : ulong leaf_off = iter.leaf_off;
983 1875681 : ulong pair_idx = iter.pair_idx;
984 :
985 1875681 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, leaf_off );
986 :
987 1875681 : if( FD_UNLIKELY( !pair_idx ) ) { /* optimize for high radix */
988 1875681 : leaf_off = leaf->prev_off;
989 1875681 : pair_idx = FD_UNLIKELY( !leaf_off ) ? 1UL : BPLUS_(private_leaf_const)( bplus, leaf_off )->pair_cnt;
990 1875681 : }
991 1875681 : pair_idx--;
992 :
993 1875681 : iter.leaf_off = leaf_off;
994 1875681 : iter.pair_idx = pair_idx;
995 1875681 : return iter;
996 1875681 : }
997 :
998 : static inline BPLUS_PAIR_T const *
999 : BPLUS_(iter_pair_const)( BPLUS_(t) const * join,
1000 11243061 : BPLUS_(iter_t) iter ) {
1001 11243061 : return BPLUS_(private_leaf_const)( BPLUS_(private_const)( join ), iter.leaf_off )->pair + iter.pair_idx;
1002 11243061 : }
1003 :
1004 : static inline BPLUS_PAIR_T *
1005 : BPLUS_(iter_pair)( BPLUS_(t) * join,
1006 3751362 : BPLUS_(iter_t) iter ) {
1007 3751362 : return BPLUS_(private_leaf)( BPLUS_(private)( join ), iter.leaf_off )->pair + iter.pair_idx;
1008 3751362 : }
1009 :
1010 : FD_PROTOTYPES_END
1011 :
1012 : #endif
1013 :
1014 : #if BPLUS_IMPL_STYLE==0 || BPLUS_IMPL_STYLE==2
1015 :
1016 : /* Implementation *****************************************************/
1017 :
1018 : /* bplus_private_node_query returns the index of a node's child tree,
1019 : in [0,tree_cnt), that might contain query.
1020 :
1021 : tree 0 covers keys [ -inf, pivot[0] )
1022 : i covers keys [ pivot[i-1], pivot[i] )
1023 : tree_cnt-1 covers keys [ pivot[tree_cnt-2], +inf )
1024 :
1025 : Assumes pivot contains unique keys in ascending order, tree_cnt is in
1026 : [2,tree_max], tree_max <<< ULONG_MAX and query is valid. */
1027 :
1028 : FD_FN_PURE static ulong
1029 : BPLUS_(private_node_query)( BPLUS_KEY_T const * FD_RESTRICT pivot,
1030 : ulong tree_cnt,
1031 232074357 : BPLUS_KEY_T const * FD_RESTRICT query ) {
1032 232074357 : ulong i0 = 0UL;
1033 232074357 : ulong i1 = tree_cnt;
1034 :
1035 431911257 : do {
1036 :
1037 : /* At this point, query might be found in trees in [i0,i1) and this
1038 : range contains at least two trees. Test the middle tree. If it
1039 : matches exactly, we are done. Otherwise, recurse on the
1040 : appropriate half of the range. */
1041 :
1042 431911257 : ulong im = (i0+i1) >> 1; /* No overflow, at least 1 */
1043 :
1044 431911257 : int cmp = BPLUS_(private_key_cmp)( query, &pivot[im-1UL] );
1045 431911257 : if( FD_UNLIKELY( !cmp ) ) return im; /* (optional) early abort, optimize for big trees */
1046 426659160 : i0 = fd_ulong_if( cmp<0, i0, im );
1047 426659160 : i1 = fd_ulong_if( cmp<0, im, i1 );
1048 :
1049 426659160 : } while( FD_LIKELY( (i1-i0)>1UL) ); /* optimize for big trees */
1050 :
1051 226822260 : return i0;
1052 232074357 : }
1053 :
1054 : /* bplus_private_pair_query returns the index of a leaf's pair, in
1055 : [0,pair_cnt), that exactly matches query or pair if there is no
1056 : matching pair. Assumes pair keys are unique and ascending sorted,
1057 : pair_cnt is in [1,pair_max], pair_max <<< ULONG_MAX and query is
1058 : valid. */
1059 :
1060 : FD_FN_PURE static ulong
1061 : BPLUS_(private_pair_query)( BPLUS_PAIR_T const * FD_RESTRICT pair,
1062 : ulong pair_cnt,
1063 20654814 : BPLUS_KEY_T const * FD_RESTRICT query ) {
1064 20654814 : ulong i0 = 0UL;
1065 20654814 : ulong i1 = pair_cnt;
1066 :
1067 33977313 : do {
1068 :
1069 : /* At this point, query might match one of the pairs in [i0,i1) and
1070 : this range is not empty. Test the pair in the middle. If it
1071 : matches, we found the pair. Otherwise, recurse appropriate half
1072 : of the range (exclusive of our query). */
1073 :
1074 33977313 : ulong im = (i0+i1) >> 1; /* No overflow */
1075 :
1076 33977313 : int cmp = BPLUS_(private_key_cmp)( query, &pair[im].BPLUS_PAIR_KEY );
1077 33977313 : if( FD_UNLIKELY( !cmp ) ) return im; /* Found, optimize for big trees */
1078 20820888 : i0 = fd_ulong_if( cmp<0, i0, im+1UL );
1079 20820888 : i1 = fd_ulong_if( cmp<0, im, i1 );
1080 :
1081 20820888 : } while( FD_LIKELY( i1-i0 ) ); /* optimize for big trees */
1082 :
1083 7498389 : return pair_cnt; /* not found */
1084 20654814 : }
1085 :
1086 : /* bplus_private_child_insert inserts a child at position child_idx into
1087 : parent. Parent should have a tree_cnt in [1,tree_max-1] and
1088 : child_idx should be in [1,tree_cnt] (such that the child is never
1089 : inserted into a parent with no children or a parent with the maximum
1090 : number of children and is never inserted as the first born child).
1091 : child_off is the bplus global offset of the child. This can be a
1092 : node or leaf but it should match parent's current children.
1093 : child_pivot is the pivot value associated with the child and the
1094 : child_idx should preserve the parent's pivot sorting. Further, child
1095 : should not contain any keys that outside the parent's pivot range
1096 : after the insert. */
1097 :
1098 : static void
1099 : BPLUS_(private_child_insert)( BPLUS_(private_node_t) * FD_RESTRICT parent,
1100 : ulong child_idx,
1101 : ulong child_off,
1102 1198194 : BPLUS_KEY_T const * FD_RESTRICT child_pivot ) {
1103 1198194 : ulong tree_cnt = parent->tree_cnt;
1104 1198194 : ulong * FD_RESTRICT tree_off = parent->tree_off;
1105 1198194 : BPLUS_KEY_T * FD_RESTRICT pivot = parent->pivot;
1106 :
1107 : /* Make room for child at child_idx by shifting childen currently at
1108 : or after child_idx up one. */
1109 :
1110 2885352 : for( ulong sibling_idx=tree_cnt; sibling_idx>child_idx; sibling_idx-- ) {
1111 1687158 : tree_off[sibling_idx ] = tree_off[sibling_idx-1UL];
1112 1687158 : pivot [sibling_idx-1UL] = pivot [sibling_idx-2UL];
1113 1687158 : }
1114 :
1115 : /* Insert the child at child_idx */
1116 :
1117 1198194 : tree_off[child_idx ] = child_off;
1118 1198194 : pivot [child_idx-1UL] = child_pivot[0];
1119 :
1120 1198194 : parent->tree_cnt = tree_cnt + 1UL; /* In [2,tree_max] */
1121 1198194 : }
1122 :
1123 : /* bplus_private_child_remove removes the child child_idx from the bplus
1124 : node parent. Assumes parent is valid with a tree cnt in [2,tree_max]
1125 : and that child is in [1,tree_cnt) (as such, this will never remove
1126 : the first born child). */
1127 :
1128 : static void
1129 : BPLUS_(private_child_remove)( BPLUS_(private_node_t) * FD_RESTRICT parent,
1130 1193622 : ulong child_idx ) {
1131 1193622 : ulong tree_cnt = parent->tree_cnt;
1132 1193622 : ulong * FD_RESTRICT tree_off = parent->tree_off;
1133 1193622 : BPLUS_KEY_T * FD_RESTRICT pivot = parent->pivot;
1134 :
1135 : /* Fill the hole at child_idx by shifting childen currently at or
1136 : after child_idx down one. */
1137 :
1138 1193622 : tree_cnt--;
1139 2689209 : for( ulong sibling_idx=child_idx; sibling_idx<tree_cnt; sibling_idx++ ) {
1140 1495587 : tree_off[sibling_idx ] = tree_off[sibling_idx+1UL];
1141 1495587 : pivot [sibling_idx-1UL] = pivot [sibling_idx ];
1142 1495587 : }
1143 :
1144 1193622 : parent->tree_cnt = tree_cnt; /* In [1,tree_max-1] */
1145 1193622 : }
1146 :
1147 : ulong
1148 18 : BPLUS_(leaf_max_est)( ulong ele_max_est ) {
1149 :
1150 : /* No leaves needed for always empty trees */
1151 :
1152 18 : if( FD_UNLIKELY( !ele_max_est ) ) return 0UL;
1153 :
1154 : /* Trivial bplus trees have just a root leaf */
1155 :
1156 12 : if( FD_UNLIKELY( ele_max_est<=BPLUS_(private_pair_max)() ) ) return 1UL;
1157 :
1158 : /* In a non-trivial bplus tree, each leaf has at least
1159 : pair_min==pair_max/2 elements. So, we require:
1160 :
1161 : leaf_max*pair_min >= ele_max_est
1162 : -> leaf_max >= ele_max_est / pair_min
1163 :
1164 : The smallest leaf_max that satisfies this is:
1165 :
1166 : ceil( ele_max_est / pair_min )
1167 : -> floor( (ele_max_est + pair_min - 1) / pair_min )
1168 : -> 1 + floor( (ele_max_est - 1) / pair_min */
1169 :
1170 6 : return 1UL + ((ele_max_est-1UL) / BPLUS_(private_pair_min)()); /* No overflow */
1171 12 : }
1172 :
1173 : ulong
1174 9 : BPLUS_(node_max_est)( ulong ele_max_est ) {
1175 :
1176 : /* Start at the leaf layer with leaf_max trees */
1177 :
1178 9 : ulong node_max = 0UL;
1179 9 : ulong tree_cnt = BPLUS_(leaf_max_est)( ele_max_est );
1180 :
1181 30 : while( tree_cnt>1UL ) {
1182 :
1183 : /* At this point, we have more than one tree in the current layer.
1184 : To reduce the number of trees, we create a new layer of nodes
1185 : above it and make each new node responsible for up to
1186 : tree_min==tree_max/2 of the trees in the current layer to give a
1187 : reasonably tight bound to the worst case. That implies this new
1188 : layer will need at most:
1189 :
1190 : ceil( tree_cnt / tree_min )
1191 : -> floor( (tree_cnt + tree_min - 1) / tree_min )
1192 : -> 1 + floor( (tree_cnt - 1) / tree_min )
1193 :
1194 : nodes and this layer will reduce to the number of trees to the
1195 : same number. */
1196 :
1197 21 : tree_cnt = 1UL + ((tree_cnt-1UL) / BPLUS_(private_tree_min)()); /* No overflow */
1198 21 : node_max += tree_cnt;
1199 :
1200 21 : }
1201 :
1202 9 : return node_max;
1203 9 : }
1204 :
1205 : ulong
1206 0 : BPLUS_(align)( void ) {
1207 0 : return BPLUS_ALIGN;
1208 0 : }
1209 :
1210 : ulong
1211 : BPLUS_(footprint)( ulong node_max,
1212 18 : ulong leaf_max ) {
1213 :
1214 18 : if( FD_UNLIKELY( (node_max > BPLUS_(private_node_max_max)()) | (leaf_max > BPLUS_(private_leaf_max_max)()) ) ) return 0UL;
1215 :
1216 : /* At this point, the needed node and leaf storage is at most 2^63,
1217 : which is impractically large but also with plenty of room left over
1218 : for the metadata and remaining alignment padding. */
1219 :
1220 6 : ulong off = 0UL; /**/ off += sizeof( BPLUS_(private_t) );
1221 6 : off = fd_ulong_align_up( off, BPLUS_NODE_ALIGN ); /*ulong node_lo = off;*/ off += node_max*sizeof( BPLUS_(private_node_t) );
1222 6 : off = fd_ulong_align_up( off, BPLUS_LEAF_ALIGN ); /*ulong leaf_lo = off;*/ off += leaf_max*sizeof( BPLUS_(private_leaf_t) );
1223 6 : off = fd_ulong_align_up( off, BPLUS_ALIGN );
1224 :
1225 6 : return off;
1226 18 : }
1227 :
1228 : void
1229 6 : BPLUS_(flush)( BPLUS_(t) * bplus ) {
1230 6 : bplus->node_pool_off = 0UL;
1231 6 : bplus->leaf_pool_off = 0UL;
1232 6 : bplus->root_off = 0UL;
1233 6 : bplus->leaf_min_off = 0UL;
1234 6 : bplus->leaf_max_off = 0UL;
1235 :
1236 6 : BPLUS_(private_node_t) * node = BPLUS_(private_node)( bplus, bplus->node_lo );
1237 6162 : for( ulong node_rem=bplus->node_max; node_rem; node_rem-- ) BPLUS_(private_node_release)( bplus, &node[ node_rem-1UL ] );
1238 :
1239 6 : BPLUS_(private_leaf_t) * leaf = BPLUS_(private_leaf)( bplus, bplus->leaf_lo );
1240 12294 : for( ulong leaf_rem=bplus->leaf_max; leaf_rem; leaf_rem-- ) BPLUS_(private_leaf_release)( bplus, &leaf[ leaf_rem-1UL ] );
1241 6 : }
1242 :
1243 : void *
1244 : BPLUS_(new)( void * shmem,
1245 : ulong node_max,
1246 15 : ulong leaf_max ) {
1247 15 : BPLUS_(private_t) * bplus = (BPLUS_(private_t) *)shmem;
1248 :
1249 15 : if( FD_UNLIKELY( !bplus ) ) {
1250 3 : FD_LOG_WARNING(( "NULL shmem" ));
1251 3 : return NULL;
1252 3 : }
1253 :
1254 12 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)bplus, BPLUS_ALIGN ) ) ) {
1255 3 : FD_LOG_WARNING(( "misaligned shmem" ));
1256 3 : return NULL;
1257 3 : }
1258 :
1259 9 : ulong footprint = BPLUS_(footprint)( node_max, leaf_max );
1260 9 : if( FD_UNLIKELY( !footprint ) ) {
1261 6 : FD_LOG_WARNING(( "bad node_max and/or leaf_max" ));
1262 6 : return NULL;
1263 6 : }
1264 :
1265 : /* Note: it is the caller's responsibility to clear the memory because
1266 : it is potentially very big and very time consuming to do so and may
1267 : already have been cleared (e.g. mmap from the OS) */
1268 :
1269 3 : ulong off;
1270 3 : off = 0UL; /**/ off += sizeof( BPLUS_(private_t) );
1271 3 : off = fd_ulong_align_up( off, BPLUS_NODE_ALIGN ); ulong node_lo = off; off += node_max*sizeof( BPLUS_(private_node_t) );
1272 3 : off = fd_ulong_align_up( off, BPLUS_LEAF_ALIGN ); ulong leaf_lo = off; off += leaf_max*sizeof( BPLUS_(private_leaf_t) );
1273 3 : off = fd_ulong_align_up( off, BPLUS_ALIGN );
1274 :
1275 3 : bplus->node_max = node_max; bplus->leaf_max = leaf_max;
1276 3 : bplus->node_lo = node_lo; bplus->leaf_lo = leaf_lo;
1277 :
1278 3 : BPLUS_(flush)( bplus );
1279 :
1280 3 : FD_COMPILER_MFENCE();
1281 3 : bplus->magic = BPLUS_MAGIC;
1282 3 : FD_COMPILER_MFENCE();
1283 :
1284 3 : return shmem;
1285 9 : }
1286 :
1287 : BPLUS_(t) *
1288 12 : BPLUS_(join)( void * shbplus ) {
1289 12 : BPLUS_(private_t) * bplus = (BPLUS_(private_t) *)shbplus;
1290 :
1291 12 : if( FD_UNLIKELY( !bplus ) ) {
1292 3 : FD_LOG_WARNING(( "NULL shbplus" ));
1293 3 : return NULL;
1294 3 : }
1295 :
1296 9 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)bplus, BPLUS_ALIGN ) ) ) {
1297 3 : FD_LOG_WARNING(( "misaligned shbplus" ));
1298 3 : return NULL;
1299 3 : }
1300 :
1301 6 : if( FD_UNLIKELY( bplus->magic!=BPLUS_MAGIC ) ) {
1302 3 : FD_LOG_WARNING(( "bad magic" ));
1303 3 : return NULL;
1304 3 : }
1305 :
1306 3 : return (BPLUS_(t) *)bplus;
1307 6 : }
1308 :
1309 : void *
1310 6 : BPLUS_(leave)( BPLUS_(t) * join ) {
1311 6 : if( FD_UNLIKELY( !join ) ) {
1312 3 : FD_LOG_WARNING(( "NULL join" ));
1313 3 : return NULL;
1314 3 : }
1315 :
1316 3 : return (void *)join;
1317 6 : }
1318 :
1319 : void *
1320 12 : BPLUS_(delete)( void * shbplus ) {
1321 12 : BPLUS_(private_t) * bplus = (BPLUS_(private_t) *)shbplus;
1322 :
1323 12 : if( FD_UNLIKELY( !bplus ) ) {
1324 3 : FD_LOG_WARNING(( "NULL shbplus" ));
1325 3 : return NULL;
1326 3 : }
1327 :
1328 9 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)bplus, BPLUS_ALIGN ) ) ) {
1329 3 : FD_LOG_WARNING(( "misaligned shbplus" ));
1330 3 : return NULL;
1331 3 : }
1332 :
1333 6 : if( FD_UNLIKELY( bplus->magic!=BPLUS_MAGIC ) ) {
1334 3 : FD_LOG_WARNING(( "bad magic" ));
1335 3 : return NULL;
1336 3 : }
1337 :
1338 3 : FD_COMPILER_MFENCE();
1339 3 : bplus->magic = 0UL;
1340 3 : FD_COMPILER_MFENCE();
1341 :
1342 3 : return (void *)bplus;
1343 6 : }
1344 :
1345 : BPLUS_PAIR_T const *
1346 : BPLUS_(query_const)( BPLUS_(t) const * join,
1347 15014253 : BPLUS_KEY_T const * query ) {
1348 15014253 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
1349 :
1350 : /* If an empty bplus tree, not found */
1351 :
1352 15014253 : ulong tree_off = bplus->root_off;
1353 15014253 : if( FD_UNLIKELY( !tree_off ) ) return NULL; /* optimize for big trees */
1354 :
1355 : /* At this point, the bplus tree is not empty. Find the leaf that
1356 : might contain query. */
1357 :
1358 15013974 : ulong leaf_lo = bplus->leaf_lo;
1359 95716287 : while( FD_LIKELY( !BPLUS_(private_is_leaf)( tree_off, leaf_lo ) ) ) { /* optimize for big trees */
1360 80702313 : BPLUS_(private_node_t) const * node = BPLUS_(private_node_const)( bplus, tree_off );
1361 80702313 : tree_off = node->tree_off[ BPLUS_(private_node_query)( node->pivot, node->tree_cnt, query ) ];
1362 80702313 : }
1363 15013974 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, tree_off );
1364 :
1365 : /* At this point, leaf might contain query. Query the leaf */
1366 :
1367 15013974 : pair_t const * pair = leaf->pair;
1368 15013974 : ulong pair_cnt = leaf->pair_cnt;
1369 15013974 : ulong pair_idx = BPLUS_(private_pair_query)( pair, pair_cnt, query );
1370 :
1371 15013974 : return fd_ptr_if( pair_idx<pair_cnt, &pair[ pair_idx ], NULL );
1372 15014253 : }
1373 :
1374 : BPLUS_PAIR_T *
1375 : BPLUS_(private_insert)( BPLUS_(t) * join,
1376 : BPLUS_KEY_T const * key,
1377 : int upsert,
1378 7525116 : int * _insert ) {
1379 7525116 : BPLUS_(private_t) * bplus = BPLUS_(private)( join );
1380 :
1381 : /* If the bplus tree is empty, create the root leaf and insert the key
1382 : into it */
1383 :
1384 7525116 : ulong tree_off = bplus->root_off;
1385 7525116 : if( FD_UNLIKELY( !tree_off ) ) { /* Empty bplus, optimize for big */
1386 :
1387 189 : BPLUS_(private_leaf_t) * root = BPLUS_(private_leaf_acquire)( bplus );
1388 189 : if( FD_UNLIKELY( !root ) ) return NULL; /* no room for insert */
1389 :
1390 189 : root->prev_off = 0UL;
1391 189 : root->next_off = 0UL;
1392 189 : root->pair_cnt = 1UL;
1393 189 : root->pair[0].BPLUS_PAIR_KEY = key[0];
1394 189 : ulong root_off = BPLUS_(private_off)( bplus, root );
1395 189 : bplus->root_off = root_off;
1396 189 : bplus->leaf_min_off = root_off;
1397 189 : bplus->leaf_max_off = root_off;
1398 :
1399 189 : *_insert = 1;
1400 189 : return &root->pair[0];
1401 :
1402 189 : }
1403 :
1404 : /* At this point, the bplus tree is not empty. We recurse through
1405 : interior nodes to find the leaf that should hold key, splitting
1406 : interior nodes as we go. */
1407 :
1408 7524927 : ulong tree_min = BPLUS_(private_tree_min)();
1409 7524927 : ulong tree_max = BPLUS_(private_tree_max)(); /* ==tree_min*2 */
1410 :
1411 7524927 : BPLUS_(private_node_t) * parent = NULL;
1412 7524927 : ulong child_idx = 0UL;
1413 :
1414 7524927 : ulong leaf_lo = bplus->leaf_lo;
1415 47962443 : while( FD_LIKELY( !BPLUS_(private_is_leaf)( tree_off, leaf_lo ) ) ) { /* Optimize for big trees */
1416 40437516 : BPLUS_(private_node_t) * node = BPLUS_(private_node)( bplus, tree_off );
1417 :
1418 : /* At this point, we should insert key into one of the node's trees
1419 : and tree_cnt is in [2,tree_max] (root) or [tree_min,tree_max]
1420 : (non-root). If the node has a parent, parent and all node's
1421 : siblings are nodes and parent has in [2,tree_max-1] (root parent)
1422 : or [tree_min,tree_max-1] (non-root parent) children. (tree_max-1
1423 : because if it had tree_max children when insert started, we would
1424 : have split it on the previous iteration).
1425 :
1426 : If the node is full, split it. */
1427 :
1428 40437516 : ulong tree_cnt = node->tree_cnt;
1429 40437516 : if( FD_UNLIKELY( tree_cnt==tree_max ) ) { /* Optimize for high radix */
1430 :
1431 : /* Acquire resources. If node is the root, this includes making a
1432 : new root node and making the new root node's parent. */
1433 :
1434 231162 : BPLUS_(private_node_t) * new_node = BPLUS_(private_node_acquire)( bplus );
1435 231162 : if( FD_UNLIKELY( !new_node ) ) return NULL; /* No room for insert */
1436 :
1437 231162 : if( FD_UNLIKELY( !parent ) ) {
1438 714 : parent = BPLUS_(private_node_acquire)( bplus );
1439 714 : if( FD_UNLIKELY( !parent ) ) {
1440 0 : BPLUS_(private_node_release)( bplus, new_node );
1441 0 : return NULL; /* No room for insert */
1442 0 : }
1443 :
1444 714 : bplus->root_off = BPLUS_(private_off)( bplus, parent );
1445 :
1446 714 : parent->tree_cnt = 1UL; /* Will be incremented to 2 by the child_insert below. */
1447 714 : parent->tree_off[0] = BPLUS_(private_off)( bplus, node );
1448 :
1449 714 : child_idx = 0UL;
1450 714 : }
1451 :
1452 : /* At this point, node is child child_idx of parent and we need to
1453 : split node. Further, new_node is the node that will be created
1454 : by the split and parent has room to insert a link to new_node.
1455 : Split node evenly into new_node and update the parent
1456 : accordingly. */
1457 :
1458 231162 : BPLUS_KEY_T const * median = &node->pivot[ tree_min-1UL ];
1459 :
1460 231162 : node->tree_cnt = tree_min;
1461 :
1462 231162 : new_node->tree_cnt = tree_min;
1463 231162 : memcpy( new_node->tree_off, node->tree_off + tree_min, sizeof(ulong) * tree_min );
1464 231162 : memcpy( new_node->pivot, node->pivot + tree_min, sizeof(BPLUS_KEY_T)*(tree_min-1UL) );
1465 :
1466 231162 : BPLUS_(private_child_insert)( parent, child_idx+1UL, BPLUS_(private_off)( bplus, new_node ), median );
1467 :
1468 : /* Move into the appropriate split */
1469 :
1470 231162 : node = fd_ptr_if( BPLUS_(private_key_cmp)( key, median )<0, node, new_node );
1471 231162 : tree_cnt = tree_min;
1472 231162 : }
1473 :
1474 : /* At this point, we should insert key into one of the node's trees
1475 : and tree_cnt is in [2,tree_max-1] (root) or [tree_min,tree_max-1]
1476 : (non root) such that we are guaranteed to be able to insert. */
1477 :
1478 40437516 : parent = node;
1479 40437516 : child_idx = BPLUS_(private_node_query)( node->pivot, tree_cnt, key );
1480 40437516 : tree_off = node->tree_off[ child_idx ];
1481 40437516 : }
1482 :
1483 7524927 : BPLUS_(private_leaf_t) * leaf = BPLUS_(private_leaf)( bplus, tree_off );
1484 :
1485 : /* At this point, we'd like to insert key into leaf. But if leaf is
1486 : full, we split it to make room. */
1487 :
1488 7524927 : ulong pair_min = BPLUS_(private_pair_min)();
1489 7524927 : ulong pair_max = BPLUS_(private_pair_max)(); /* ==pair_min*2 */
1490 :
1491 7524927 : ulong pair_cnt = (ulong)leaf->pair_cnt;
1492 7524927 : if( FD_UNLIKELY( pair_cnt==pair_max ) ) { /* optimize for high radix */
1493 :
1494 : /* Acquire resources. If leaf is the root, this includes making a
1495 : new root node and making the new root node's parent. */
1496 :
1497 967035 : BPLUS_(private_leaf_t) * new_leaf = BPLUS_(private_leaf_acquire)( bplus );
1498 967035 : if( FD_UNLIKELY( !new_leaf ) ) return NULL; /* No room for insert */
1499 :
1500 967032 : if( FD_UNLIKELY( !parent ) ) {
1501 159 : parent = BPLUS_(private_node_acquire)( bplus );
1502 159 : if( FD_UNLIKELY( !parent ) ) {
1503 0 : BPLUS_(private_leaf_release)( bplus, new_leaf );
1504 0 : return NULL; /* No room to insert */
1505 0 : }
1506 :
1507 159 : bplus->root_off = BPLUS_(private_off)( bplus, parent );
1508 :
1509 159 : parent->tree_cnt = 1UL; /* Will be incremented to 2 below */
1510 159 : parent->tree_off[0] = BPLUS_(private_off)( bplus, leaf );
1511 :
1512 159 : child_idx = 0UL;
1513 159 : }
1514 :
1515 : /* At this point, leaf is child child_idx of parent and we need to
1516 : split leaf. Further, new_leaf is the leaf that will be created
1517 : by the split and parent has room to insert a link to new_leaf.
1518 : Split leaf evenly into new_leaf and update the parent
1519 : accordingly. Splitting this leaf might make a new max leaf (it
1520 : will never make a new min leaf). */
1521 :
1522 967032 : BPLUS_KEY_T const * median = &leaf->pair[ pair_min ].BPLUS_PAIR_KEY;
1523 :
1524 967032 : ulong next_off = leaf->next_off;
1525 :
1526 967032 : leaf->pair_cnt = pair_min;
1527 967032 : leaf->next_off = BPLUS_(private_off)( bplus, new_leaf );
1528 :
1529 967032 : new_leaf->pair_cnt = pair_min;
1530 967032 : new_leaf->prev_off = BPLUS_(private_off)( bplus, leaf );
1531 967032 : new_leaf->next_off = next_off;
1532 967032 : memcpy( &new_leaf->pair[0], &leaf->pair[pair_min], sizeof(pair_t)*pair_min );
1533 :
1534 : /* FIXME: BRANCHLESS? */
1535 967032 : ulong new_leaf_off = BPLUS_(private_off)( bplus, new_leaf );
1536 967032 : if( FD_UNLIKELY( !next_off ) ) bplus->leaf_max_off = new_leaf_off;
1537 963618 : else BPLUS_(private_leaf)( bplus, next_off )->prev_off = new_leaf_off;
1538 :
1539 967032 : BPLUS_(private_child_insert)( parent, child_idx+1UL, new_leaf_off, median );
1540 :
1541 : /* Move into the appropriate split */
1542 :
1543 967032 : leaf = (BPLUS_(private_key_cmp)( key, median )<0) ? leaf : new_leaf;
1544 967032 : pair_cnt = pair_min;
1545 967032 : }
1546 :
1547 : /* At this point, leaf either contains key or is where we should
1548 : insert key. Further, pair_cnt is in [1,pair_max-1] (root) or
1549 : [pair_min,pair_max-1] (non root). Search for key in the leaf. If
1550 : key is not in the leaf, the search will reveal where to put the
1551 : key. */
1552 :
1553 7524924 : BPLUS_PAIR_T * pair = leaf->pair;
1554 7524924 : ulong i0 = 0UL;
1555 7524924 : ulong i1 = pair_cnt;
1556 12506436 : do {
1557 :
1558 : /* At this point, pairs in [0,i0) are before key, pairs in
1559 : [i1,pair_cnt) are after key and pairs in [i0,i1) (non-empty) are
1560 : not known. Probe the middle of this range for key. */
1561 :
1562 12506436 : ulong im = (i0+i1) >> 1; /* no overflow */
1563 :
1564 12506436 : int cmp = BPLUS_(private_key_cmp)( &pair[ im ].BPLUS_PAIR_KEY, key );
1565 :
1566 : /* If cmp==0, pair im holds the key and we are done. Otherwise, if
1567 : cmp<0 / cmp>0, pair im is before / after key. We adjust the
1568 : ranges appropriately and recurse. */
1569 :
1570 12506436 : if( FD_UNLIKELY( !cmp ) ) { /* optimize for big trees */
1571 3752997 : leaf->pair_cnt = pair_cnt;
1572 3752997 : if( !upsert ) return NULL; /* compile time */
1573 1875972 : *_insert = 0;
1574 1875972 : return &pair[ im ];
1575 3752997 : }
1576 8753439 : i0 = fd_ulong_if( cmp>0, i0, im+1UL );
1577 8753439 : i1 = fd_ulong_if( cmp>0, im, i1 );
1578 :
1579 8753439 : } while( i1>i0 );
1580 :
1581 : /* At this point, leaf does not contain key, pairs [0,i0) are before
1582 : key, pairs [i0,pair_cnt) are after key and we have room for key.
1583 : Move pairs [i0,pair_cnt) right 1 to make room and insert the key at
1584 : pair i0. */
1585 :
1586 3771927 : memmove( pair+i0+1UL, pair+i0, (pair_cnt-i0)*sizeof(BPLUS_PAIR_T) );
1587 3771927 : pair[ i0 ].BPLUS_PAIR_KEY = key[0];
1588 3771927 : leaf->pair_cnt = pair_cnt + 1UL;
1589 3771927 : *_insert = 1;
1590 3771927 : return &pair[ i0 ];
1591 7524924 : }
1592 :
1593 : int
1594 : BPLUS_(remove_key)( BPLUS_(t) * join,
1595 5640936 : BPLUS_KEY_T const * key ) {
1596 5640936 : BPLUS_(private_t) * bplus = BPLUS_(private)( join );
1597 :
1598 : /* If tree is empty, nothing to remove */
1599 :
1600 5640936 : ulong tree_off = bplus->root_off;
1601 5640936 : if( FD_UNLIKELY( !tree_off ) ) return -1; /* not found, optimize for found */
1602 :
1603 : /* At this point, the tree is not empty. Find the path through the
1604 : tree to the leaf with the key to remove. Note that 128 is more
1605 : than enough given strong lg N depth algorithmic guarantees and wide
1606 : radices. */
1607 :
1608 5640840 : BPLUS_(private_node_t) * path_node [ 128 ];
1609 5640840 : ulong path_tree_idx[ 128 ];
1610 5640840 : ulong path_cnt = 0UL;
1611 :
1612 5640840 : ulong leaf_lo = bplus->leaf_lo;
1613 35960184 : while( FD_LIKELY( !BPLUS_(private_is_leaf)( tree_off, leaf_lo ) ) ) { /* optimize for big trees */
1614 30319344 : BPLUS_(private_node_t) * node = BPLUS_(private_node)( bplus, tree_off );
1615 :
1616 30319344 : ulong tree_idx = BPLUS_(private_node_query)( node->pivot, node->tree_cnt, key );
1617 :
1618 30319344 : path_node [ path_cnt ] = node;
1619 30319344 : path_tree_idx[ path_cnt ] = tree_idx;
1620 30319344 : path_cnt++;
1621 :
1622 30319344 : tree_off = node->tree_off[ tree_idx ];
1623 30319344 : }
1624 :
1625 5640840 : BPLUS_(private_leaf_t) * leaf = BPLUS_(private_leaf)( bplus, tree_off );
1626 :
1627 : /* At this point, leaf might contain key. Search for key. */
1628 :
1629 5640840 : BPLUS_PAIR_T * pair = leaf->pair;
1630 5640840 : ulong pair_cnt = leaf->pair_cnt;
1631 5640840 : ulong pair_idx = BPLUS_(private_pair_query)( pair, pair_cnt, key );
1632 :
1633 5640840 : if( FD_UNLIKELY( pair_idx>=pair_cnt ) ) return -1; /* not found, optimize for found */
1634 :
1635 : /* At this point, pair[ pair_idx ] is the pair to remove. Remove it. */
1636 :
1637 3763473 : pair_cnt--;
1638 6947751 : for( ulong idx=pair_idx; idx<pair_cnt; idx++ ) pair[idx] = pair[idx+1UL];
1639 3763473 : leaf->pair_cnt = pair_cnt; /* FIXME: MOVE BELOW? */
1640 :
1641 : /* At this point, the leaf might be unbalanced but everything else in
1642 : the bplus tree is balanced. */
1643 :
1644 3763473 : if( FD_UNLIKELY( !path_cnt ) ) { /* optimize for big trees */
1645 :
1646 : /* At this point, we removed a pair from the root leaf and the
1647 : leaf's pair_cnt is in [0,pair_max-1] . If there are still pairs
1648 : in the leaf, the bplus tree is still balanced and we are done.
1649 : Otherwise, we release the leaf and make an empty bplus tree
1650 : (which is balanced by definition). */
1651 :
1652 561 : if( FD_LIKELY( pair_cnt ) ) return 0; /* optimize for big trees */
1653 186 : bplus->root_off = 0UL;
1654 186 : bplus->leaf_min_off = 0UL;
1655 186 : bplus->leaf_max_off = 0UL;
1656 186 : BPLUS_(private_leaf_release)( bplus, leaf );
1657 186 : return 0;
1658 :
1659 561 : }
1660 :
1661 : /* At this point, we removed a pair from a non-root leaf and the
1662 : leaf's pair_cnt is in [pair_min-1,pair_max-1]. If there are at
1663 : least pair_min pairs left in the leaf, the bplus tree is still
1664 : balanced and we are done. */
1665 :
1666 3762912 : ulong pair_min = BPLUS_(private_pair_min)();
1667 3762912 : ulong pair_max = BPLUS_(private_pair_max)();
1668 :
1669 3762912 : if( FD_LIKELY( pair_cnt>=pair_min ) ) return 0; /* optimize for big trees */
1670 :
1671 : /* At this point, we removed a pair from a non-root leaf and its
1672 : pair_cnt is pair_min-1. As such, it is not balanced with its
1673 : siblings (leaf must have at least leaf_min-1 siblings that must
1674 : also be leaves with a pair_cnt in [pair_min,pair_max]). Determine
1675 : which sibling to use for rebalancing and how to rebalance with this
1676 : sibling. This sibling will have a pair cnt in [pair_min,pair_max].
1677 :
1678 : Note: Could be more adaptive here (e.g. pick the larger sibling
1679 : when leaf is a middle child). */
1680 :
1681 1660878 : path_cnt--;
1682 1660878 : BPLUS_(private_node_t) * parent = path_node [ path_cnt ];
1683 1660878 : ulong child_idx = path_tree_idx[ path_cnt ];
1684 :
1685 1660878 : ulong sib0_idx = child_idx - (ulong)(child_idx>0UL);
1686 1660878 : ulong sib1_idx = sib0_idx + 1UL;
1687 :
1688 1660878 : ulong sib0_off = parent->tree_off[ sib0_idx ];
1689 1660878 : ulong sib1_off = parent->tree_off[ sib1_idx ];
1690 :
1691 1660878 : BPLUS_(private_leaf_t) * sib0 = BPLUS_(private_leaf)( bplus, sib0_off );
1692 1660878 : BPLUS_(private_leaf_t) * sib1 = BPLUS_(private_leaf)( bplus, sib1_off );
1693 :
1694 1660878 : ulong sib0_pair_cnt = sib0->pair_cnt;
1695 1660878 : ulong sib1_pair_cnt = sib1->pair_cnt;
1696 :
1697 1660878 : ulong reb_pair_cnt = sib0_pair_cnt + sib1_pair_cnt; /* in [pair_max-1,2*pair_max-1]. */
1698 1660878 : if( FD_LIKELY( reb_pair_cnt>=pair_max ) ) {
1699 :
1700 : /* At this point, reb_pair_cnt is in [pair_max,2*pair_max-1].
1701 : Divide these as evenly as possible between sib0 and sib1 and
1702 : update the parent's pivot accordingly. Since we do not remove
1703 : any trees from the parent, this will rebalance the whole bplus
1704 : tree fully and we are done. */
1705 :
1706 697206 : ulong new_sib0_pair_cnt = reb_pair_cnt >> 1;
1707 697206 : ulong new_sib1_pair_cnt = reb_pair_cnt - new_sib0_pair_cnt;
1708 :
1709 697206 : if( new_sib0_pair_cnt>sib0_pair_cnt ) { /* Shift pairs from sib1 into sib0 */
1710 :
1711 167931 : ulong delta = new_sib0_pair_cnt - sib0_pair_cnt;
1712 167931 : memcpy ( sib0->pair + sib0_pair_cnt, sib1->pair, sizeof(BPLUS_PAIR_T)*delta );
1713 167931 : memmove( sib1->pair, sib1->pair + delta, sizeof(BPLUS_PAIR_T)*new_sib1_pair_cnt );
1714 :
1715 529275 : } else { /* Shift pairs from sib0 into sib1 */
1716 :
1717 529275 : ulong delta = sib0_pair_cnt - new_sib0_pair_cnt;
1718 529275 : memmove( sib1->pair + delta, sib1->pair, sizeof(BPLUS_PAIR_T)*sib1_pair_cnt );
1719 529275 : memcpy ( sib1->pair, sib0->pair + new_sib0_pair_cnt, sizeof(BPLUS_PAIR_T)*delta );
1720 :
1721 529275 : }
1722 :
1723 697206 : sib0->pair_cnt = new_sib0_pair_cnt;
1724 697206 : sib1->pair_cnt = new_sib1_pair_cnt;
1725 :
1726 697206 : parent->pivot[sib0_idx] = sib1->pair[0].BPLUS_PAIR_KEY;
1727 697206 : return 0;
1728 697206 : }
1729 :
1730 : /* At this point, reb_pair_cnt is pair_max-1 such that these siblings
1731 : must be merged to restore balance among the leaves. This might
1732 : change the leaf max from sib1 to sib0. */
1733 :
1734 963672 : memcpy( sib0->pair + sib0_pair_cnt, sib1->pair, sizeof(BPLUS_PAIR_T)*sib1_pair_cnt );
1735 963672 : sib0->pair_cnt = reb_pair_cnt;
1736 :
1737 963672 : ulong sib2_off = sib1->next_off;
1738 963672 : sib0->next_off = sib2_off;
1739 :
1740 : /* FIXME: DO BRANCHLESS? */
1741 963672 : if( FD_UNLIKELY( !sib2_off ) ) bplus->leaf_max_off = sib0_off;
1742 961158 : else BPLUS_(private_leaf)( bplus, sib2_off )->prev_off = sib0_off;
1743 :
1744 963672 : BPLUS_(private_child_remove)( parent, sib1_idx );
1745 963672 : BPLUS_(private_leaf_release)( bplus, sib1 );
1746 :
1747 : /* The merge might have unbalance parent among its siblings. If it
1748 : has not, we are done. Otherwise, we rebalance parent among its
1749 : siblings. That might unbalance the grandparent among its siblings.
1750 : And so on along the path potentially all the back to the bplus tree
1751 : root. */
1752 :
1753 963672 : ulong tree_min = BPLUS_(private_tree_min)();
1754 963672 : ulong tree_max = BPLUS_(private_tree_max)();
1755 :
1756 1193622 : while( FD_LIKELY( path_cnt ) ) { /* optimize for big trees */
1757 1189584 : BPLUS_(private_node_t) * child = parent;
1758 :
1759 : /* At this point, because we just removed a tree from child, child's
1760 : tree_cnt is in [tree_min-1,tree_max-1] but everything else is
1761 : balanced. If the child has at least tree_min trees, the bplus
1762 : tree is still balanced. */
1763 :
1764 1189584 : ulong child_tree_cnt = child->tree_cnt;
1765 1189584 : if( FD_LIKELY( child_tree_cnt>=tree_min ) ) return 0; /* optimize for big trees */
1766 :
1767 : /* At this point, child's tree_cnt is tree_min-1. As such, it is
1768 : not balanced with its siblings (child must have at least
1769 : leaf_min-1 siblings that must also be nodes with a tree_cnt in
1770 : [tree_min,tree_max]). Determine which sibling to use for
1771 : rebalancing and how to rebalance.
1772 :
1773 : Note: Could be more adaptive here (e.g. pick the larger sibling
1774 : if a middle child). */
1775 :
1776 410850 : path_cnt--;
1777 410850 : parent = path_node [ path_cnt ];
1778 410850 : child_idx = path_tree_idx[ path_cnt ];
1779 :
1780 410850 : ulong sib0_idx = child_idx - (ulong)(child_idx>0UL);
1781 410850 : ulong sib1_idx = sib0_idx + 1UL;
1782 :
1783 410850 : ulong sib0_off = parent->tree_off[ sib0_idx ];
1784 410850 : ulong sib1_off = parent->tree_off[ sib1_idx ];
1785 :
1786 410850 : BPLUS_(private_node_t) * sib0 = BPLUS_(private_node)( bplus, sib0_off );
1787 410850 : BPLUS_(private_node_t) * sib1 = BPLUS_(private_node)( bplus, sib1_off );
1788 :
1789 410850 : ulong sib0_tree_cnt = sib0->tree_cnt;
1790 410850 : ulong sib1_tree_cnt = sib1->tree_cnt;
1791 :
1792 410850 : ulong reb_tree_cnt = sib0_tree_cnt + sib1_tree_cnt; /* in [tree_max-1,2*tree_max-1]. */
1793 410850 : if( FD_LIKELY( reb_tree_cnt>=tree_max ) ) {
1794 :
1795 : /* At this point, reb_tree_cnt is in [tree_max,2*tree_max-1].
1796 : Divide these as evenly as possible between sib0 and sib1 and
1797 : update the parent's pivot accordingly. Since we do not remove
1798 : any trees from parent, this will rebalance the whole bplus tree
1799 : and we are done. */
1800 :
1801 180900 : ulong new_sib0_tree_cnt = reb_tree_cnt >> 1;
1802 180900 : ulong new_sib1_tree_cnt = reb_tree_cnt - new_sib0_tree_cnt;
1803 :
1804 180900 : if( new_sib0_tree_cnt>sib0_tree_cnt ) { /* Shift leading sib1 trees to trailing sib0 trees */
1805 :
1806 43557 : ulong delta = new_sib0_tree_cnt - sib0_tree_cnt;
1807 43557 : memcpy ( sib0->tree_off + sib0_tree_cnt, sib1->tree_off, sizeof(ulong)*delta );
1808 43557 : memmove( sib1->tree_off, sib1->tree_off + delta, sizeof(ulong)*new_sib1_tree_cnt );
1809 :
1810 : /* Copy parent pivot and leading delta-1 sib1 pivots into sib0. */
1811 :
1812 43557 : sib0->pivot[ sib0_tree_cnt-1UL ] = parent->pivot[ sib0_idx ];
1813 43557 : memcpy( sib0->pivot + sib0_tree_cnt, sib1->pivot, (delta-1UL)*sizeof(BPLUS_KEY_T) );
1814 :
1815 : /* At this point, there is 1 hole in the parent pivots and
1816 : delta-1 holes in the leading sib1 pivots. Copy the next sib1
1817 : pivot to the parent. */
1818 :
1819 43557 : parent->pivot[ sib0_idx ] = sib1->pivot[ delta-1UL ];
1820 :
1821 : /* At this point, there are delta holes in the leading sib1
1822 : pivots. Shift remaining sib1 pivots down delta. */
1823 :
1824 43557 : memmove( sib1->pivot, sib1->pivot+delta, (new_sib1_tree_cnt-1UL)*sizeof(BPLUS_KEY_T) );
1825 :
1826 137343 : } else { /* Shift trailing sib0 trees to leading sib1 trees */
1827 :
1828 137343 : ulong delta = sib0_tree_cnt - new_sib0_tree_cnt;
1829 137343 : memmove( sib1->tree_off + delta, sib1->tree_off, sizeof(ulong)*sib1_tree_cnt );
1830 137343 : memcpy ( sib1->tree_off, sib0->tree_off + new_sib0_tree_cnt, sizeof(ulong)*delta );
1831 :
1832 : /* Shift sib1 pivots up delta. */
1833 :
1834 137343 : memmove( sib1->pivot+delta, sib1->pivot, (sib1_tree_cnt-1UL)*sizeof(BPLUS_KEY_T) );
1835 :
1836 : /* At this point, there are delta holes in the leading sib1
1837 : pivots. Copy trailing delta-1 sib0 pivots and parent pivot
1838 : into sib1. */
1839 :
1840 137343 : memcpy( sib1->pivot, sib0->pivot+new_sib0_tree_cnt, (delta-1UL)*sizeof(BPLUS_KEY_T) );
1841 137343 : sib1->pivot[ delta-1UL ] = parent->pivot[ sib0_idx ];
1842 :
1843 : /* At this point, there is 1 hole in the parent pivot. Copy
1844 : trailing sib0 pivot into parent. */
1845 :
1846 137343 : parent->pivot[ sib0_idx ] = sib0->pivot[ new_sib0_tree_cnt-1UL ];
1847 :
1848 137343 : }
1849 :
1850 180900 : sib0->tree_cnt = new_sib0_tree_cnt;
1851 180900 : sib1->tree_cnt = new_sib1_tree_cnt;
1852 180900 : return 0;
1853 180900 : }
1854 :
1855 : /* At this point, reb_tree_cnt is tree_max-1 such that these
1856 : siblings must be merged to restore balance among siblings. Since
1857 : this might unbalance parent relative to its siblings, we need to
1858 : keep iterating. */
1859 :
1860 229950 : memcpy( sib0->tree_off + sib0_tree_cnt, sib1->tree_off, sizeof(ulong)*sib1_tree_cnt );
1861 :
1862 229950 : sib0->pivot[ sib0_tree_cnt-1UL ] = parent->pivot[ sib0_idx ];
1863 229950 : memcpy( sib0->pivot + sib0_tree_cnt, sib1->pivot, sizeof(BPLUS_KEY_T)*(sib1_tree_cnt-1UL) );
1864 :
1865 229950 : sib0->tree_cnt = reb_tree_cnt;
1866 :
1867 229950 : BPLUS_(private_child_remove)( parent, sib1_idx );
1868 :
1869 229950 : BPLUS_(private_node_release)( bplus, sib1 );
1870 229950 : }
1871 :
1872 : /* At this point, parent is the root node and we just removed a tree
1873 : from it. If parent still has more than 1 tree, the bplus tree is
1874 : balanced and we are done. Otherwise, we make parent's sole child
1875 : the new root and release parent to finish balancing the tree. */
1876 :
1877 4038 : if( FD_LIKELY( parent->tree_cnt>1UL ) ) return 0; /* optimize for big trees */
1878 :
1879 855 : bplus->root_off = parent->tree_off[ 0 ];
1880 855 : BPLUS_(private_node_release)( bplus, parent );
1881 855 : return 0;
1882 4038 : }
1883 :
1884 : BPLUS_(iter_t)
1885 : BPLUS_(private_iter)( BPLUS_(t) const * join,
1886 : BPLUS_KEY_T const * query,
1887 14996460 : int op ) {
1888 14996460 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
1889 :
1890 14996460 : BPLUS_(iter_t) iter;
1891 :
1892 : /* If the bplus is empty or query is NULL, return nul */
1893 :
1894 14996460 : ulong tree_off = bplus->root_off;
1895 14996460 : if( FD_UNLIKELY( (!tree_off) | (!query) ) ) { /* empty, optimize for big trees */
1896 348 : iter.leaf_off = 0UL;
1897 348 : iter.pair_idx = 0UL;
1898 348 : return iter;
1899 348 : }
1900 :
1901 : /* At this point, the bplus is not empty. Find the leaf that might
1902 : contain query. */
1903 :
1904 14996112 : ulong leaf_lo = bplus->leaf_lo;
1905 95611296 : while( FD_LIKELY( !BPLUS_(private_is_leaf)( tree_off, leaf_lo ) ) ) { /* Optimize for big trees */
1906 80615184 : BPLUS_(private_node_t) const * node = BPLUS_(private_node_const)( bplus, tree_off );
1907 80615184 : tree_off = node->tree_off[ BPLUS_(private_node_query)( node->pivot, node->tree_cnt, query ) ];
1908 80615184 : }
1909 14996112 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, tree_off );
1910 :
1911 : /* At this point, pairs in the previous leaf (if any) have keys less
1912 : than query and pairs in the next leaf (if any) have keys greater
1913 : than query. Search the leaf for query. */
1914 :
1915 14996112 : BPLUS_PAIR_T const * pair = leaf->pair;
1916 14996112 : ulong pair_cnt = leaf->pair_cnt;
1917 :
1918 14996112 : ulong i0 = 0UL;
1919 14996112 : ulong i1 = pair_cnt;
1920 :
1921 23399328 : do {
1922 :
1923 : /* At this point, the range [i0,i1) contains at least 1 pair. Pairs
1924 : [0,i0) have keys less than query, pairs [i1,pair_cnt) have keys
1925 : greater than query and we don't know about pairs [i0,i1). Test
1926 : the pair in the middle.
1927 :
1928 : If this pair's key matches query, because all keys are unique, we
1929 : know that pair im is the first pair greater than or equal to
1930 : query and that pair im+1 is the first pair greater than query.
1931 :
1932 : If this pair's key is greater than query, we know all pairs in
1933 : [im,pair_cnt) are greater than query so we update i1 to im.
1934 :
1935 : If this pair's key is less than query, we know that all pairs in
1936 : [0,im+1) are less than query so we update i0 to im+1. */
1937 :
1938 23399328 : ulong im = (i0+i1) >> 1; /* No overflow */
1939 :
1940 23399328 : int cmp = BPLUS_(private_key_cmp)( &pair[im].BPLUS_PAIR_KEY, query );
1941 23399328 : if( FD_UNLIKELY( !cmp ) ) { /* optimize for big trees */
1942 :
1943 : /* At this point, pairs [0,im) have keys less than query, pair im
1944 : key matches query and pairs (im,pair_cnt) are greater than
1945 : query. If:
1946 :
1947 : op==0 (GE): pick i0 == im such that [0,i0) are < query and [i0,pair_cnt) are >= query
1948 : op==1 (GT): pick i0 == im+1 such that [0,i0) are <= query and [i0,pair_cnt) are > query
1949 : op==2 (LE): pick i0 == im+1 such that [0,i0) are <= query and [i0,pair_cnt) are > query
1950 : op==3 (LT): pick i0 == im such that [0,i0) are < query and [i0,pair_cnt) are >= query */
1951 :
1952 7494132 : i0 = im + (ulong)((op==1) | (op==2)); /* compile time */
1953 7494132 : break;
1954 7494132 : }
1955 15905196 : i0 = fd_ulong_if( cmp>0, i0, im+1UL );
1956 15905196 : i1 = fd_ulong_if( cmp>0, im, i1 );
1957 :
1958 15905196 : } while( FD_LIKELY( i1-i0 ) ); /* optimize for big trees */
1959 :
1960 : /* At this point:
1961 :
1962 : op==0 (GE): pairs [i0,pair_cnt) have keys greater than or equal to query
1963 : op==1 (GT): pairs [i0,pair_cnt) have keys greater than query
1964 : op==2 (LE): pairs [0,i0) have keys less than or equal to query
1965 : op==3 (LT): pairs [0,i0) have keys less than query */
1966 :
1967 14996112 : if( op<=1 ) { /* compile time */
1968 :
1969 7498056 : if( FD_UNLIKELY( i0==pair_cnt ) ) { /* optimize for big trees */
1970 :
1971 : /* At this point:
1972 :
1973 : op==0 (GE): all pairs have keys less than query and pairs in any next leaf have keys greater than query
1974 : op==1 (GT): all pairs have keys less than or equal to query and pairs in any next leaf have keys greater than query
1975 :
1976 : position iterator at first pair in next leaf (or nul if this is
1977 : the max leaf). */
1978 :
1979 4491171 : tree_off = leaf->next_off;
1980 4491171 : i0 = 0UL;
1981 4491171 : }
1982 :
1983 7498056 : } else {
1984 :
1985 7498056 : if( FD_UNLIKELY( i0==0UL ) ) { /* optimize for big trees */
1986 :
1987 : /* At this point:
1988 :
1989 : op==2 (LE): all pairs have keys greater than query and pairs in any prev leaf have keys less than query
1990 : op==3 (LT): all pairs have keys greater than or equal to query and pairs in any prev leaf have keys less than query
1991 :
1992 : position iterator at last pair in previous leaf (or nul if this
1993 : is the min leaf). */
1994 :
1995 742896 : tree_off = leaf->prev_off;
1996 742896 : i0 = FD_UNLIKELY( !tree_off ) ? 1UL : BPLUS_(private_leaf_const)( bplus, tree_off )->pair_cnt;
1997 742896 : }
1998 7498056 : i0--;
1999 :
2000 7498056 : }
2001 :
2002 14996112 : iter.leaf_off = tree_off;
2003 14996112 : iter.pair_idx = i0;
2004 14996112 : return iter;
2005 14996460 : }
2006 :
2007 : int
2008 1873254 : BPLUS_(verify)( BPLUS_(t) const * join ) {
2009 :
2010 55111332036 : # define BPLUS_TEST(c) do { \
2011 52928495214 : if( FD_UNLIKELY( !(c) ) ) { \
2012 0 : FD_LOG_WARNING(( "FAIL: %s", #c )); \
2013 0 : return -1; \
2014 0 : } \
2015 52928495214 : } while(0)
2016 :
2017 : /* Verify join */
2018 :
2019 1873254 : BPLUS_TEST( join );
2020 :
2021 1873254 : BPLUS_(private_t) const * bplus = BPLUS_(private_const)( join );
2022 :
2023 1873254 : BPLUS_TEST( fd_ulong_is_aligned( (ulong)bplus, BPLUS_ALIGN ) );
2024 :
2025 : /* Verify header */
2026 :
2027 1873254 : BPLUS_TEST( bplus->magic==BPLUS_MAGIC );
2028 :
2029 1873254 : ulong node_max = bplus->node_max;
2030 1873254 : ulong leaf_max = bplus->leaf_max;
2031 :
2032 1873254 : BPLUS_TEST( node_max<=BPLUS_(private_node_max_max)() );
2033 1873254 : BPLUS_TEST( leaf_max<=BPLUS_(private_leaf_max_max)() );
2034 :
2035 1873254 : ulong node_lo = bplus->node_lo;
2036 1873254 : ulong leaf_lo = bplus->leaf_lo;
2037 :
2038 1873254 : BPLUS_TEST( node_lo==fd_ulong_align_up( sizeof( BPLUS_(private_t) ), BPLUS_NODE_ALIGN ) );
2039 1873254 : BPLUS_TEST( leaf_lo==fd_ulong_align_up( node_lo + node_max*sizeof( BPLUS_(private_node_t) ), BPLUS_LEAF_ALIGN ) );
2040 :
2041 1873254 : ulong node_hi = node_lo + node_max*sizeof( BPLUS_(private_node_t) );
2042 1873254 : ulong leaf_hi = leaf_lo + leaf_max*sizeof( BPLUS_(private_leaf_t) );
2043 :
2044 1873254 : ulong root_off = bplus->root_off;
2045 1873254 : ulong leaf_min_off = bplus->leaf_min_off;
2046 1873254 : ulong leaf_max_off = bplus->leaf_max_off;
2047 :
2048 1873254 : if( FD_LIKELY( root_off ) ) {
2049 :
2050 1873170 : BPLUS_TEST( node_lo<=root_off ); BPLUS_TEST( root_off<leaf_hi );
2051 1873170 : BPLUS_TEST( fd_ulong_is_aligned( root_off, fd_ulong_if( !BPLUS_(private_is_leaf)( root_off, leaf_lo ),
2052 1873170 : BPLUS_NODE_ALIGN, BPLUS_LEAF_ALIGN ) ) );
2053 :
2054 1873170 : BPLUS_TEST( leaf_lo<=leaf_min_off ); BPLUS_TEST( leaf_min_off<leaf_hi );
2055 1873170 : BPLUS_TEST( fd_ulong_is_aligned( leaf_min_off, BPLUS_LEAF_ALIGN ) );
2056 :
2057 1873170 : BPLUS_TEST( leaf_lo<=leaf_max_off ); BPLUS_TEST( leaf_max_off<leaf_hi );
2058 1873170 : BPLUS_TEST( fd_ulong_is_aligned( leaf_max_off, BPLUS_LEAF_ALIGN ) );
2059 :
2060 1873170 : } else {
2061 :
2062 84 : BPLUS_TEST( !leaf_min_off );
2063 84 : BPLUS_TEST( !leaf_max_off );
2064 :
2065 84 : }
2066 :
2067 1873254 : ulong node_rem = bplus->node_max;
2068 1873254 : ulong leaf_rem = bplus->leaf_max;
2069 :
2070 : /* Verify node pool */
2071 :
2072 1873254 : ulong node_off = bplus->node_pool_off;
2073 1336552995 : while( FD_LIKELY( node_off ) ) {
2074 1334679741 : BPLUS_TEST( node_rem ); node_rem--;
2075 1334679741 : BPLUS_TEST( node_lo<=node_off ); BPLUS_TEST( node_off<node_hi );
2076 1334679741 : BPLUS_TEST( fd_ulong_is_aligned( node_off, BPLUS_NODE_ALIGN ) );
2077 1334679741 : node_off = BPLUS_(private_node_const)( bplus, node_off )->tree_off[0];
2078 1334679741 : }
2079 :
2080 : /* Verify leaf pool */
2081 :
2082 1873254 : ulong leaf_off = bplus->leaf_pool_off;
2083 2242739487 : while( FD_LIKELY( leaf_off ) ) {
2084 2240866233 : BPLUS_TEST( leaf_rem ); leaf_rem--;
2085 2240866233 : BPLUS_TEST( leaf_lo<=leaf_off ); BPLUS_TEST( leaf_off<leaf_hi );
2086 2240866233 : BPLUS_TEST( fd_ulong_is_aligned( leaf_off, BPLUS_LEAF_ALIGN ) );
2087 2240866233 : leaf_off = BPLUS_(private_leaf_const)( bplus, leaf_off )->next_off;
2088 2240866233 : }
2089 :
2090 : /* Verify the actual tree */
2091 :
2092 1873254 : ulong leaf_cnt = leaf_rem;
2093 :
2094 1873254 : if( FD_LIKELY( root_off ) ) { /* optimize for big trees */
2095 :
2096 : /* At this point, the tree is not empty */
2097 :
2098 1873170 : ulong tree_min = BPLUS_(private_tree_min)();
2099 1873170 : ulong tree_max = BPLUS_(private_tree_max)();
2100 :
2101 1873170 : ulong pair_min = BPLUS_(private_pair_min)();
2102 1873170 : ulong pair_max = BPLUS_(private_pair_max)();
2103 :
2104 1873170 : ulong stack_tree_off [ 128 ];
2105 1873170 : ulong stack_subtree_idx[ 128 ];
2106 1873170 : BPLUS_KEY_T const * stack_key_lo [ 128 ];
2107 1873170 : BPLUS_KEY_T const * stack_key_hi [ 128 ];
2108 1873170 : ulong stack_cnt = 0UL;
2109 1873170 : ulong stack_max = 128UL;
2110 :
2111 1873170 : ulong tree_off = root_off;
2112 1873170 : ulong subtree_idx = 0UL;
2113 1873170 : BPLUS_KEY_T const * key_lo = NULL;
2114 1873170 : BPLUS_KEY_T const * key_hi = NULL;
2115 :
2116 3776521611 : for(;;) {
2117 :
2118 : /* At this point, we are still validating the tree rooted at
2119 : tree_off and this tree should contain only keys in
2120 : [key_lo,key_hi). key_{lo,hi}==NULL indicates key_{lo,hi} is
2121 : {-inf,+inf}.
2122 :
2123 : If tree is a node, we've validated all of tree's subtrees
2124 : [0,subtree_idx). subtree_idx==0 indicates this is the first
2125 : time we've visited this node.
2126 :
2127 : If tree is a leaf, as we only visit each leaf exactly once,
2128 : subtree_idx will be zero (and otherwise ignored). */
2129 :
2130 3776521611 : if( FD_LIKELY( !BPLUS_(private_is_leaf)( tree_off, leaf_lo ) ) ) { /* tree is a node */
2131 :
2132 : /* If this is the first time visiting this node, validate it */
2133 :
2134 2180963652 : if( FD_UNLIKELY( !subtree_idx ) ) {
2135 :
2136 : /* Validate no loops */
2137 :
2138 587278863 : BPLUS_TEST( node_rem ); node_rem--;
2139 :
2140 : /* Validate the node pointer */
2141 :
2142 587278863 : BPLUS_TEST( node_lo<=tree_off ); BPLUS_TEST( tree_off<node_hi );
2143 587278863 : BPLUS_TEST( fd_ulong_is_aligned( tree_off, BPLUS_NODE_ALIGN ) );
2144 :
2145 587278863 : BPLUS_(private_node_t) const * node = BPLUS_(private_node_const)( bplus, tree_off );
2146 :
2147 587278863 : BPLUS_KEY_T const * subtree_pivot = node->pivot;
2148 587278863 : ulong const * subtree_off = node->tree_off;
2149 587278863 : ulong subtree_cnt = node->tree_cnt;
2150 :
2151 : /* Validate the node tree count */
2152 :
2153 587278863 : BPLUS_TEST( fd_ulong_if( tree_off!=root_off, tree_min, 2UL )<=subtree_cnt );
2154 587278863 : BPLUS_TEST( subtree_cnt<=tree_max );
2155 :
2156 : /* Validate the node tree offsets */
2157 :
2158 587278863 : int is_leaf = BPLUS_(private_is_leaf)( subtree_off[0], leaf_lo );
2159 :
2160 587278863 : ulong lo = fd_ulong_if( is_leaf, leaf_lo, node_lo );
2161 587278863 : ulong hi = fd_ulong_if( is_leaf, leaf_hi, node_hi );
2162 587278863 : ulong align = fd_ulong_if( is_leaf, BPLUS_LEAF_ALIGN, BPLUS_NODE_ALIGN );
2163 :
2164 2768242515 : for( ulong idx=0UL; idx<subtree_cnt; idx++ ) {
2165 2180963652 : ulong off = subtree_off[ idx ];
2166 2180963652 : BPLUS_TEST( lo<=off ); BPLUS_TEST( off<hi );
2167 2180963652 : BPLUS_TEST( fd_ulong_is_aligned( off, align ) );
2168 2180963652 : }
2169 :
2170 : /* Validate the node pivots */
2171 :
2172 587278863 : if( FD_LIKELY( key_lo ) ) BPLUS_TEST( BPLUS_(private_key_cmp)( key_lo, &subtree_pivot[0] )<0 );
2173 :
2174 1593684789 : for( ulong idx=1UL; idx<subtree_cnt-1UL; idx++ )
2175 1006405926 : BPLUS_TEST( BPLUS_(private_key_cmp)( &subtree_pivot[idx-1UL], &subtree_pivot[idx] )<0 );
2176 :
2177 587278863 : if( FD_LIKELY( key_hi ) ) BPLUS_TEST( BPLUS_(private_key_cmp)( &subtree_pivot[subtree_cnt-2UL], key_hi )<0 );
2178 587278863 : }
2179 :
2180 : /* At this point, tree_off is a bplus global offset of a
2181 : verified node (verified either just now or on a previous
2182 : iteration). If subtree_idx isn't the last subtree, push
2183 : subtree_idx+1 onto the stack for a later iteration. */
2184 :
2185 2180963652 : BPLUS_(private_node_t) const * node = BPLUS_(private_node_const)( bplus, tree_off );
2186 :
2187 2180963652 : BPLUS_KEY_T const * subtree_pivot = node->pivot;
2188 2180963652 : ulong const * subtree_off = node->tree_off;
2189 2180963652 : ulong subtree_cnt = node->tree_cnt;
2190 :
2191 2180963652 : if( FD_LIKELY( (subtree_idx+1UL)<subtree_cnt ) ) {
2192 1593684789 : BPLUS_TEST( stack_cnt<stack_max );
2193 1593684789 : stack_tree_off [ stack_cnt ] = tree_off;
2194 1593684789 : stack_subtree_idx[ stack_cnt ] = subtree_idx+1UL;
2195 1593684789 : stack_key_lo [ stack_cnt ] = key_lo;
2196 1593684789 : stack_key_hi [ stack_cnt ] = key_hi;
2197 1593684789 : stack_cnt++;
2198 1593684789 : }
2199 :
2200 : /* And recurse into subtree_idx for the next iteration. Note
2201 : this node's key_lo is subtree_idx 0's key_lo and this node's
2202 : key_hi is subtree_idx tree_cnt-1's key_hi. */
2203 :
2204 2180963652 : /**/ tree_off = subtree_off [ subtree_idx ];
2205 2180963652 : if( FD_LIKELY( subtree_idx>0UL ) ) key_lo = &subtree_pivot[ subtree_idx-1UL ];
2206 2180963652 : if( FD_LIKELY( subtree_idx<subtree_cnt-1UL ) ) key_hi = &subtree_pivot[ subtree_idx ];
2207 2180963652 : subtree_idx = 0UL;
2208 2180963652 : continue;
2209 2180963652 : }
2210 :
2211 : /* At this point, tree is a leaf. Validate no loops. */
2212 :
2213 1595557959 : BPLUS_TEST( leaf_rem ); leaf_rem--;
2214 :
2215 : /* Validate the leaf pointer */
2216 :
2217 1595557959 : BPLUS_TEST( leaf_lo<=tree_off ); BPLUS_TEST( tree_off<leaf_hi );
2218 1595557959 : BPLUS_TEST( fd_ulong_is_aligned( tree_off, BPLUS_LEAF_ALIGN ) );
2219 :
2220 1595557959 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, tree_off );
2221 :
2222 1595557959 : BPLUS_PAIR_T const * pair = leaf->pair;
2223 1595557959 : ulong pair_cnt = leaf->pair_cnt;
2224 :
2225 : /* Validate the leaf pair count */
2226 :
2227 1595557959 : BPLUS_TEST( fd_ulong_if( tree_off!=root_off, pair_min, 1UL )<=pair_cnt );
2228 1595557959 : BPLUS_TEST( pair_cnt<=pair_max );
2229 :
2230 : /* Validate the leaf pairs */
2231 :
2232 1595557959 : if( FD_LIKELY( key_lo ) ) BPLUS_TEST( BPLUS_(private_key_cmp)( key_lo, &pair[0].BPLUS_PAIR_KEY )<=0 );
2233 :
2234 4031130432 : for( ulong idx=1UL; idx<pair_cnt; idx++ )
2235 2435572473 : BPLUS_TEST( BPLUS_(private_key_cmp)( &pair[idx-1UL].BPLUS_PAIR_KEY, &pair[idx].BPLUS_PAIR_KEY )<0 );
2236 :
2237 1595557959 : if( FD_LIKELY( key_hi ) ) BPLUS_TEST( BPLUS_(private_key_cmp)( &pair[ pair_cnt-1UL ].BPLUS_PAIR_KEY, key_hi )<0 );
2238 :
2239 : /* (Note that we validate the leaf ordered iterator below.) */
2240 :
2241 : /* If no more work to do, abort. Otherwise, get the next node to
2242 : process. */
2243 :
2244 1595557959 : if( FD_UNLIKELY( !stack_cnt ) ) break;
2245 1593684789 : stack_cnt--;
2246 1593684789 : tree_off = stack_tree_off [ stack_cnt ];
2247 1593684789 : subtree_idx = stack_subtree_idx[ stack_cnt ];
2248 1593684789 : key_lo = stack_key_lo [ stack_cnt ];
2249 1593684789 : key_hi = stack_key_hi [ stack_cnt ];
2250 1593684789 : }
2251 1873170 : }
2252 :
2253 : /* Validate all nodes and leaves touched */
2254 :
2255 1873254 : BPLUS_TEST( !node_rem );
2256 1873254 : BPLUS_TEST( !leaf_rem );
2257 :
2258 : /* Validate leaf iteration */
2259 :
2260 1873254 : leaf_rem = leaf_cnt;
2261 :
2262 1873254 : ulong leaf_prev_off = 0UL;
2263 1873254 : /**/ leaf_off = bplus->leaf_min_off;
2264 1597431213 : while( leaf_off ) { /* Validates leaf->next_off for last iteration */
2265 :
2266 : /* Validate no loops */
2267 :
2268 1595557959 : BPLUS_TEST( leaf_rem ); leaf_rem--;
2269 :
2270 : /* Validate forward iteration (validates bplus->leaf_min_off first
2271 : iteration, validates leaf->next_off interior iterations) */
2272 :
2273 1595557959 : BPLUS_TEST( leaf_lo<=leaf_off ); BPLUS_TEST( leaf_off<leaf_hi );
2274 1595557959 : BPLUS_TEST( fd_ulong_is_aligned( leaf_off, BPLUS_LEAF_ALIGN ) );
2275 1595557959 : BPLUS_(private_leaf_t) const * leaf = BPLUS_(private_leaf_const)( bplus, leaf_off );
2276 :
2277 : /* Validate reverse iteration (validates leaf->prev_off,
2278 : bplus->leaf_max_off validated below) */
2279 :
2280 1595557959 : BPLUS_TEST( leaf->prev_off==leaf_prev_off );
2281 :
2282 : /* Validate ordered leaves */
2283 :
2284 1595557959 : if( FD_LIKELY( leaf_prev_off ) ) {
2285 1593684789 : BPLUS_(private_leaf_t) const * prev = BPLUS_(private_leaf_const)( bplus, leaf_prev_off );
2286 1593684789 : BPLUS_TEST( BPLUS_(private_key_cmp)( &prev->pair[ prev->pair_cnt-1UL ].BPLUS_PAIR_KEY, &leaf->pair[ 0 ].BPLUS_PAIR_KEY )<0 );
2287 1593684789 : }
2288 :
2289 1595557959 : leaf_prev_off = leaf_off;
2290 1595557959 : leaf_off = leaf->next_off;
2291 1595557959 : }
2292 :
2293 1873254 : BPLUS_TEST( bplus->leaf_max_off==leaf_prev_off ); /* Validates bplus->leaf_max_off */
2294 1873254 : BPLUS_TEST( !leaf_rem ); /* All leaves in tree covered */
2295 :
2296 1873254 : # undef BPLUS_TEST
2297 :
2298 1873254 : return 0;
2299 1873254 : }
2300 :
2301 : #endif
2302 :
2303 : #undef BPLUS_STATIC
2304 : #undef BPLUS_
2305 :
2306 : #undef BPLUS_IMPL_STYLE
2307 : #undef BPLUS_MAGIC
2308 : #undef BPLUS_LEAF_ALIGN
2309 : #undef BPLUS_NODE_ALIGN
2310 : #undef BPLUS_ALIGN
2311 : #undef BPLUS_TREE_MAX
2312 : #undef BPLUS_NODE_MAX
2313 : #undef BPLUS_PAIR_T
2314 : #undef BPLUS_KEY_T
2315 : #undef BPLUS_NAME
|