/* Declare ultra high performance priority queues of bounded run-time
   size.  Typical usage:

     struct event {
       ... app stuff
       long timeout; // Technically "PRQ_TIMEOUT_T PRQ_TIMEOUT;"
       ... app stuff
     };

     typedef struct event event_t;

     #define PRQ_NAME eventq
     #define PRQ_T    event_t
     #include "util/tmpl/fd_prq.c"

   will declare the following static inline APIs as a header only style
   library in the compilation unit:

     // align/footprint - Return the alignment/footprint required for a
     // memory region to be used as an eventq that can hold up to max
     // events.
     //
     // new - Format a memory region pointed to by shmem into an
     // eventq.  Assumes shmem points to a region with the required
     // alignment and footprint not in use by anything else.  Caller is
     // not joined on return.  Returns shmem.
     //
     // join - Join an eventq.  Assumes sheventq points at a region
     // formatted as an eventq.  Returns a pointer to the eventq's
     // heap.  The events on the heap are indexed [0,cnt).  heap[0] is
     // the current min event on the heap (that is, heap[0] is not
     // strictly after any other event on the heap).  The remaining
     // events on the heap can be in any order.  Operations on the heap
     // can reorganize the indexing in arbitrary ways (heap[0] will
     // always be the min event when there is 1 or more events on the
     // heap though).  THIS IS NOT JUST A SIMPLE CAST OF SHEVENTQ.
     //
     // leave - Leave an eventq.  Assumes heap points to the eventq's
     // heap.  Returns a pointer to the shared memory region backing
     // the join.  THIS IS NOT JUST A SIMPLE CAST OF HEAP.
     //
     // delete - Unformat a memory region used as an eventq.  Assumes
     // sheventq points to a formatted region with no current joins.
     // Returns a pointer to the unformatted memory region.

     ulong     eventq_align    ( void );
     ulong     eventq_footprint( ulong max );
     void *    eventq_new      ( void * shmem, ulong max );
     event_t * eventq_join     ( void * sheventq ); // not just a cast of sheventq
     void *    eventq_leave    ( event_t * heap );  // not just a cast of heap
     void *    eventq_delete   ( void * sheventq );

     // cnt returns the current number of events on the heap
     // max returns the maximum number of events the heap can hold

     ulong eventq_cnt( event_t const * heap ); // In [0,max]
     ulong eventq_max( event_t const * heap );

     // insert inserts the event_t pointed to by event into the event
     // queue.  Caller promises there is room in the eventq for event.
     // The eventq has no interest in event on return (its contents are
     // copied into the event queue).  Fast O(lg cnt).  Returns heap.
     //
     // remove_min is an optimized version of remove( heap, 0UL ).
     // Caller promises there is at least one event on the heap.  Fast
     // O(lg cnt).  Returns heap.
     //
     // remove removes the event_t currently at heap[idx] from the
     // heap.  idx should be in [0,cnt) where cnt is the number of
     // events on the heap at call start (as such, the caller also
     // promises there are at least idx+1 events on the heap).  Fast
     // O(lg cnt).  Returns heap.
     //
     // remove_all removes all events from the heap.  Fast O(1).
     // Returns heap.

     event_t * eventq_insert    ( event_t * heap, event_t const * event );
     event_t * eventq_remove_min( event_t * heap );
     event_t * eventq_remove    ( event_t * heap, ulong idx );
     event_t * eventq_remove_all( event_t * heap );

     // Note that none of these APIs do any input argument checking as
     // they are meant to be used in ultra high performance contexts.
     // Thus they require the user to be careful.  There is enough
     // functionality here though to trivially wrap this in variants
     // that test user arguments for sanity.

     // Really large event_t are not recommended due to the excess
     // implied data motion under the hood.  Cases with
     // sizeof(event_t)==alignof(event_t)==32 give particularly good
     // performance in practice.

   You can do this as often as you like in a compilation unit to get
   different types of priority queues.  Since it is all static inline,
   it is fine to do this in a header too.  Additional options to fine
   tune this are detailed below. */
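
/* For example, a minimal polling loop over the eventq declared above
   (a sketch: shmem, now() and handle() are hypothetical application
   provided storage / helpers):

     event_t * heap = eventq_join( eventq_new( shmem, 1024UL ) );

     for(;;) {
       long ts = now();
       while( eventq_cnt( heap ) && !(heap[0].timeout>ts) ) {
         event_t tmp = heap[0];     // Copy the min event out of the heap
         eventq_remove_min( heap ); // ... and then pop it
         handle( &tmp );            // Safe to insert new events here
       }
       // ... do other stuff, schedule new events via eventq_insert ...
     }
*/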

#include "../bits/fd_bits.h"
#include <stddef.h>

#ifndef offsetof
# define offsetof(TYPE,MEMB) ((ulong)&(((TYPE*)0)->MEMB))
#endif

#ifndef PRQ_NAME
#error "Define PRQ_NAME"
#endif

/* A PRQ_T should be something reasonable to shallow copy with the
   fields described above.  PRQ_T that are 32-bytes in size with
   32-byte alignment have particularly good cache Feng Shui. */

#ifndef PRQ_T
#error "Define PRQ_T"
#endif

/* Setting PRQ_EXPLICIT_TIMEOUT to 0 allows the user to provide a
   comparison macro for PRQ elements instead of requiring a single
   explicit timeout field. */

#ifndef PRQ_EXPLICIT_TIMEOUT
#define PRQ_EXPLICIT_TIMEOUT 1
#endif

#if PRQ_EXPLICIT_TIMEOUT

/* PRQ_TIMEOUT allows the user to specify the name of the timeout
   field used for the PRQ.  Defaults to timeout. */

#ifndef PRQ_TIMEOUT
#define PRQ_TIMEOUT timeout
#endif

/* PRQ_TIMEOUT_T allows the user to specify the type of the timeout
   field.  Defaults to long. */

#ifndef PRQ_TIMEOUT_T
#define PRQ_TIMEOUT_T long
#endif

/* PRQ_TIMEOUT_AFTER returns 1 if the timeout x is strictly after the
   timeout y.  Can be changed to create max queues instead of min
   queues for example (or to use orderable but non-integral types for
   timeouts). */

#ifndef PRQ_TIMEOUT_AFTER
#define PRQ_TIMEOUT_AFTER(x,y) ((x)>(y))
#endif

#else /* PRQ_EXPLICIT_TIMEOUT */

/* PRQ_AFTER returns 1 if element x is strictly after element y (i.e.
   y should end up closer to heap[0] than x). */

#ifndef PRQ_AFTER
#define PRQ_AFTER(x,y) ((x)>(y))
#endif

#endif
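
/* For example, a sketch of how PRQ_EXPLICIT_TIMEOUT 0 might be used to
   declare a max-queue keyed on a compound comparison (pair_t is a
   hypothetical application type):

     struct pair { ulong prio; ulong key; };
     typedef struct pair pair_t;

     #define PRQ_NAME pairq
     #define PRQ_T    pair_t
     #define PRQ_EXPLICIT_TIMEOUT 0
     #define PRQ_AFTER(x,y) (((x).prio<(y).prio) | (((x).prio==(y).prio) & ((x).key<(y).key)))
     #include "util/tmpl/fd_prq.c"

   Here "x after y" means x has lower priority (ties broken by key), so
   heap[0] is always a highest priority pair. */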

/* The PRQ_TMP_* macros are meant to allow users of the PRQ to do
   extreme low level optimizations for a particular implementation
   (e.g. loading a PRQ_T directly into vector registers and operating
   on the PRQ_T in the registers directly).  It is okay for these
   macros to do multiple evaluations of their arguments and to be more
   than one single linguistic expression. */

/* PRQ_TMP_LD declares some ideally register temporaries (t_0,t_1,...)
   and loads the PRQ_T at p into them. */

#ifndef PRQ_TMP_LD
#define PRQ_TMP_LD(t,p) PRQ_T t = (p)[0]
#endif

/* PRQ_TMP_ST stores a PRQ_T in temporaries t (see PRQ_TMP_LD) to the
   PRQ_T pointed to by p. */

#ifndef PRQ_TMP_ST
#define PRQ_TMP_ST(p,t) (p)[0] = (t)
#endif

/* PRQ_TMP_TIMEOUT returns the timeout associated with the PRQ_T in
   temporaries t. */

#ifndef PRQ_TMP_TIMEOUT
# if PRQ_EXPLICIT_TIMEOUT
# define PRQ_TMP_TIMEOUT(t) ((t).PRQ_TIMEOUT)
# endif
#endif

/* PRQ_TMP_AFTER returns 1UL if the timeout associated with the PRQ_T
   in temporaries x is strictly after the timeout associated with the
   PRQ_T in temporaries y and 0UL otherwise. */

#ifndef PRQ_TMP_AFTER
# if PRQ_EXPLICIT_TIMEOUT
# define PRQ_TMP_AFTER(x,y) ((ulong)PRQ_TIMEOUT_AFTER( PRQ_TMP_TIMEOUT(x), PRQ_TMP_TIMEOUT(y) ))
# else
# define PRQ_TMP_AFTER(x,y) ((ulong)PRQ_AFTER( x, y ))
# endif
#endif

/* PRQ_TMP_CMOV does "if(c) x = y" where x and y are PRQ_T located in
   temporaries. */

#ifndef PRQ_TMP_CMOV
#define PRQ_TMP_CMOV(c,x,y) if( (c) ) (x) = (y)
#endif
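
/* For example, with a 32-byte, 32-byte aligned PRQ_T whose timeout is
   in the second 8-byte lane, one might override the above with AVX
   intrinsics along these lines (a hypothetical sketch, not part of
   this template):

     #include <immintrin.h>
     #define PRQ_TMP_LD(t,p)     __m256i t = _mm256_load_si256( (__m256i const *)(p) )
     #define PRQ_TMP_ST(p,t)     _mm256_store_si256( (__m256i *)(p), (t) )
     #define PRQ_TMP_TIMEOUT(t)  ((long)_mm256_extract_epi64( (t), 1 ))
     #define PRQ_TMP_CMOV(c,x,y) (x) = (c) ? (y) : (x)

   such that heap events travel through the bubbling loops below in
   vector registers instead of via field-by-field scalar copies. */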

/* Implementation *****************************************************/

#define PRQ_(n) FD_EXPAND_THEN_CONCAT3(PRQ_NAME,_,n)

struct PRQ_(private) {
  ulong max;
  ulong cnt;
  PRQ_T heap[1]; /* note that this is half cache line aligned if PRQ_T is 32-byte in size and 32-byte aligned */
  /* max+1 PRQ_T follow here */
};

typedef struct PRQ_(private) PRQ_(private_t);

FD_PROTOTYPES_BEGIN

/* Private APIs *******************************************************/

/* private_from_heap returns a pointer to the prq_private given a
   pointer to the prq's heap.  private_from_heap_const is also provided
   for const-correctness purposes. */

FD_FN_CONST static inline PRQ_(private_t) *
PRQ_(private_from_heap)( PRQ_T * heap ) {
  ulong ofs = offsetof( PRQ_(private_t), heap );
  return (PRQ_(private_t) *)( (ulong)heap - (ulong)(ofs) );
}

FD_FN_CONST static inline PRQ_(private_t) const *
PRQ_(private_from_heap_const)( PRQ_T const * heap ) {
  ulong ofs = offsetof( PRQ_(private_t), heap );
  return (PRQ_(private_t) const *)( (ulong)heap - (ulong)(ofs) );
}

/* fill_hole_up fills the hole in heap with event and then bubbles it
   up toward the root until the heap property is satisfied.  This
   requires event's timeout to be less than or equal to the timeouts
   of the children in the sub-heap rooted at hole.  This is trivially
   true if hole is a leaf (i.e. no children) and also true if the new
   event's timeout is less than or equal to the hole's parent timeout
   (i.e. the parent has a timeout less than or equal to its children
   so the new event does too). */

FD_FN_UNUSED static void /* Work around -Winline */
PRQ_(private_fill_hole_up)( PRQ_T *       heap,     /* Heap, indexed 0:hole+1 */
                            ulong         hole,     /* Location of the hole to fill */
                            PRQ_T const * event ) { /* Event to fill the hole with */

  PRQ_TMP_LD( tmp_event, event ); /* Load the event to fill the hole with */
#if PRQ_EXPLICIT_TIMEOUT
  PRQ_TIMEOUT_T event_timeout = PRQ_TMP_TIMEOUT( tmp_event );
#endif
  while( hole ) { /* If the hole to fill has a parent */
    ulong parent = (hole-1UL) >> 1; /* Load the parent */
    PRQ_TMP_LD( tmp_parent, heap + parent );
#if PRQ_EXPLICIT_TIMEOUT
    PRQ_TIMEOUT_T parent_timeout = PRQ_TMP_TIMEOUT( tmp_parent );
    if( FD_LIKELY( !PRQ_TIMEOUT_AFTER( parent_timeout, event_timeout ) ) )
#else
    if( FD_LIKELY( !PRQ_TMP_AFTER( tmp_parent, tmp_event ) ) )
#endif
      break; /* If the parent is at least as old as the event, ... */
    PRQ_TMP_ST( heap + hole, tmp_parent ); /* Otherwise, fill the hole with the hole's parent */
    hole = parent;                         /* and recurse on the created hole at parent */
  }

  PRQ_TMP_ST( heap + hole, tmp_event ); /* ... fill the hole with the event to schedule */
}

/* fill_hole_dn fills the hole in heap with the last event on the heap
   and bubbles it down toward the leaves until the heap property is
   restored.  This requires that the hole to fill is the root of the
   heap (i.e. a remove-min) or the hole's parent timeout is less than
   or equal to the timeout of the heap's last event (i.e. as might
   happen in a cancel). */

FD_FN_UNUSED static void /* Work around -Winline */
PRQ_(private_fill_hole_dn)( PRQ_T * heap,   /* Heap, half cache line aligned for best perf (assuming 32-byte PRQ_T),
                                               heap[max] and heap[max+1] are dummy slots */
                            ulong   hole,   /* Location of the hole to fill, in [0,cnt] */
                            ulong   cnt ) { /* Location of the last heap event == heap event count not including the
                                               hole, cnt is in [0,max) (if there is a hole, cnt can't be max) */

  /* Note that this branch isn't strictly necessary.  If hole==cnt,
     heap[hole] will be detected as childless and will be filled by
     itself in a nop. */

  if( FD_UNLIKELY( hole>=cnt ) ) return; /* No hole to fill (empty heap or hole at the end such that heap is still contiguous) */

  /* At this point, there is a hole to fill (hole<cnt, i.e. hole in
     [0,cnt) and cnt in [1,max)).  Fill the hole by removing the last
     event and reinserting it.  This will keep events contiguously
     packed while satisfying the heap property. */

  PRQ_TMP_LD( tmp_reinsert, heap+cnt );

  ulong speculate_max = cnt | 1UL; /* First odd number >= cnt, in [cnt,cnt+1] */

  for(;;) {

    /* Determine where the first child of the hole is located.  We
       clamp this to the first odd number >= cnt such that speculative
       loads done on a hole that has no children don't go far past the
       end of the heap and only hit one cache line.  As such, child is
       an odd number in [1,cnt+1] and, as cnt<=max-1, child must be in
       [1,max].  This in turn implies child1 is even and in [2,max+1]. */

    ulong child  = fd_ulong_min( 2UL*hole+1UL, speculate_max );
    ulong child1 = child+1UL;

    /* Speculatively load the two children of this hole.  These loads
       will always hit the same cache line (the heap is half cache
       line aligned and child is odd) and these loads are safe even if
       this hole has fewer children due to the above clamp and dummy
       slots. */

    PRQ_TMP_LD( tmp_child,  heap+child  );
    PRQ_TMP_LD( tmp_child1, heap+child1 );

    /* If there is a second child and the second child is before the
       first child, the hole should be filled with either the second
       child or the event to reinsert.  Otherwise, the hole should be
       filled with the first child (if any) or the event to reinsert.

       Sigh - the compiler won't do an SSE register cmov branchlessly
       and also often undoes the corresponding child index update
       cmov.  This leaves one or more algorithmically unpredictable
       branches in a critical place ... so we DIY. */

    ulong use_child1 = ((ulong)(child1<cnt)) & PRQ_TMP_AFTER( tmp_child, tmp_child1 );
    PRQ_TMP_CMOV( use_child1, tmp_child, tmp_child1 );
    child += use_child1;

    /* If there was a child selected above and the event to reinsert
       is strictly after the child, fill the hole with the child
       (making a new hole).  Otherwise, fill the hole with the event
       to reinsert and we are done.

       Sigh - see above DIY sigh. */

    ulong use_reinsert = ((ulong)(child>=cnt)) | PRQ_TMP_AFTER( tmp_child, tmp_reinsert );
    PRQ_TMP_CMOV( use_reinsert, tmp_child, tmp_reinsert );
    PRQ_TMP_ST( heap+hole, tmp_child );
    if( use_reinsert ) break; /* Unclear branch prob */
    hole = child;
  }
}

/* fill_hole has the exact same semantics as fill_hole_dn but only
   requires the ancestors and descendants of the hole to obey the heap
   property (it will bubble the heap in the appropriate direction). */

static inline void
PRQ_(private_fill_hole)( PRQ_T * heap,
                         ulong   hole,
                         ulong   cnt ) {

  /* If the heap is still compact given the location of the hole,
     there is nothing to do.  Otherwise, we are going to fill the hole
     with the last event on the heap.  Note that this branch is not
     strictly necessary. */

  if( FD_UNLIKELY( hole>=cnt ) ) return;

  /* If the hole has a parent that is after the last event on the
     heap, we need to bubble the hole up to find where to reinsert the
     last event.  Otherwise, we need to bubble down.  Branch prob here
     is not obvious and probably application dependent. */

#if PRQ_EXPLICIT_TIMEOUT
  if( hole && PRQ_TIMEOUT_AFTER( heap[ (hole-1UL)>>1 ].PRQ_TIMEOUT, heap[ cnt ].PRQ_TIMEOUT ) )
#else
  if( hole && PRQ_AFTER        ( heap[ (hole-1UL)>>1 ]            , heap[ cnt ]             ) )
#endif
    PRQ_(private_fill_hole_up)( heap, hole, &heap[cnt] );
  else
    PRQ_(private_fill_hole_dn)( heap, hole, cnt );
}

/* Public APIs ********************************************************/

static inline ulong PRQ_(align)( void ) { return alignof(PRQ_(private_t)); }

static inline ulong
PRQ_(footprint)( ulong max ) {
  /* 16UL covers the max and cnt header fields; the +2UL is for the
     dummy slots */
  return fd_ulong_align_up( fd_ulong_align_up( 16UL, alignof(PRQ_T) ) + sizeof(PRQ_T)*(max+2UL), alignof(PRQ_(private_t)) );
}
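
/* For example, with a 32-byte, 32-byte aligned PRQ_T and max 1024, the
   above works out to align_up( 16, 32 ) + 32*(1024+2) = 32 + 32832 =
   32864 bytes (which is already a multiple of the struct alignment). */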

static inline void *
PRQ_(new)( void * mem,
           ulong  max ) {
  /* FIXME: VALIDATE MEM AND MAX? */
  PRQ_(private_t) * prq = (PRQ_(private_t) *)mem;
  prq->cnt = 0UL;
  prq->max = max;
  return (void *)prq;
}

static inline PRQ_T * PRQ_(join  )( void *  prq  ) { return ((PRQ_(private_t) *)prq)->heap; }
static inline void *  PRQ_(leave )( PRQ_T * heap ) { return (void *)PRQ_(private_from_heap)( heap ); }
static inline void *  PRQ_(delete)( void *  prq  ) { return prq; }

static inline ulong PRQ_(cnt)( PRQ_T const * heap ) { return PRQ_(private_from_heap_const)( heap )->cnt; }
static inline ulong PRQ_(max)( PRQ_T const * heap ) { return PRQ_(private_from_heap_const)( heap )->max; }

static inline PRQ_T *
PRQ_(insert)( PRQ_T *       heap,
              PRQ_T const * event ) {
  PRQ_(private_t) * prq = PRQ_(private_from_heap)( heap );
  /* FIXME: HANDHOLDING OPTIONS TO TEST FOR OVERFLOW */
  ulong hole = prq->cnt++;
  PRQ_(private_fill_hole_up)( heap, hole, event );
  return heap;
}

static inline PRQ_T *
PRQ_(remove_min)( PRQ_T * heap ) {
  PRQ_(private_t) * prq = PRQ_(private_from_heap)( heap );
  /* FIXME: HANDHOLDING OPTIONS TO TEST FOR UNDERFLOW */
  ulong cnt = --prq->cnt;
  PRQ_(private_fill_hole_dn)( heap, 0UL, cnt );
  return heap;
}

static inline PRQ_T *
PRQ_(remove)( PRQ_T * heap,
              ulong   idx ) {
  PRQ_(private_t) * prq = PRQ_(private_from_heap)( heap );
  /* FIXME: HANDHOLDING OPTIONS TO TEST FOR UNDERFLOW */
  ulong cnt = --prq->cnt;
  PRQ_(private_fill_hole)( heap, idx, cnt );
  return heap;
}

static inline PRQ_T *
PRQ_(remove_all)( PRQ_T * heap ) {
  PRQ_(private_t) * prq = PRQ_(private_from_heap)( heap );
  prq->cnt = 0UL;
  return heap;
}

FD_PROTOTYPES_END

#undef PRQ_

/* End implementation *************************************************/

#undef PRQ_TMP_CMOV
#undef PRQ_TMP_AFTER
#undef PRQ_TMP_TIMEOUT
#undef PRQ_TMP_ST
#undef PRQ_TMP_LD

#undef PRQ_AFTER
#undef PRQ_TIMEOUT_AFTER
#undef PRQ_TIMEOUT_T
#undef PRQ_TIMEOUT
#undef PRQ_EXPLICIT_TIMEOUT
#undef PRQ_T
#undef PRQ_NAME