Line data Source code
1 : /* Declares a family of functions useful for single threaded sorting of
2 : POD types in high performance contexts. Example usage:
3 :
4 : #define SORT_NAME sort_double_descend
5 : #define SORT_KEY_T double
6 : #define SORT_BEFORE(a,b) (a)>(b)
7 : #include "util/tmpl/fd_sort.c"
8 :
9 : will create the following API for use in the local compile unit:
10 :
11 : // Sort key[i] for i in [0,cnt) stable in place in a best / average
12 : // worst case of O(N) / O(N^2) / O(N^2) operations.
13 :
14 : double *
15 : sort_double_descend_insert( double * key,
16 : ulong cnt );
17 :
18 : // Return 1 if cnt is a sensible value for the stable sorting APIs
19 : // (i.e. cnt*sizeof(double)+alignof(double)-1 will not overflow).
20 :
21 : int sort_double_descend_stable_cnt_valid( ulong cnt );
22 :
23 : // Return the alignment and footprint required for a scratch region
24 : // adequate for sorting up to cnt elements. The results will be
25 : // standard allocator and standard declaration friendly. (E.g. a
26 : // declaration "double scratch[ cnt ];" will be fine as a scratch
27 : // region.)
28 :
29 : ulong sort_double_descend_stable_scratch_align ( void );
30 : ulong sort_double_descend_stable_scratch_footprint( ulong cnt );
31 :
32 : // Sort key[i] for i in [0,cnt) stable in best / average / worst
33 : // case of O(N lg N) / O(N lg N) / O(N lg N) operations. Scratch
34 : // is a scratch workspace of suitable alignment and footprint.
35 : // Returns where the sorted values ended up. Will be at either key
36 : // or (double *)scratch.
37 :
38 : double *
39 : sort_double_descend_stable_fast( double * key,
40 : ulong cnt,
41 : void * scratch );
42 :
43 : // Same as above but does additional copying if necessary such that
44 : // the final result ends up in key. Returns key.
45 :
46 : double *
47 : sort_double_descend_stable( double * key,
48 : ulong cnt,
49 : void * scratch );
50 :
51 : // Sort key[i] for i in [0,cnt) inplace in best / average / worst
52 : // case of O(N lg N) / O(N lg N) / O(N^2) operations. Returns key.
53 :
54 : double *
55 : sort_double_inplace( double * key,
56 : ulong cnt );
57 :
58 : // Partially sort key[i] such that key[rnk] holds rnk and return
59 : // key[rnk] in best / average / worst of O(N) / O(N) / O(N^2)
60 : // operations. Returns key. Assumes key is non-NULL, cnt is valid
61 : // and positive and rnk is in [0,cnt)
62 :
63 : double *
64 : sort_double_select( double * key,
65 : ulong cnt,
66 : ulong rnk );
67 :
68 : // Binary search for element equal-or-greater than key in a sorted
69 : // list. Return value is an index into sorted in [0,cnt]. If
70 : // return value > 0 then it is the largest index where
71 : // SORT_BEFORE(sorted[index-1],query)==1. Returns 0 if
72 : // SORT_BEFORE(query,elem)==1 for all elements in sorted. Assumes
73 : // SORT_BEFORE defines a total order over sorted, and each element
74 : // in sorted is less-or-equal than its successor. (Returns
75 : // incorrect result otherwise!) Robust if cnt==0 (in that case,
76 : // returns 0 and ignores pointer to sorted).
77 :
78 : ulong
79 : sort_double_search_geq( double const * sorted,
80 : ulong cnt,
81 : double query );
82 :
83 : It is fine to include this template multiple times in a compilation
84 : unit. Just provide the specification before each inclusion. Various
85 : additional options to tune the methods are described below. */
86 :
87 : #include "../bits/fd_bits.h"
88 :
89 : /* SORT_NAME gives the name of the function to declare (and the base
90 : name of auxiliary and/or variant functions). */
91 :
92 : #ifndef SORT_NAME
93 : #error "SORT_NAME must be defined"
94 : #endif
95 :
96 : /* SORT_KEY_T gives the POD datatype to sort. */
97 :
98 : #ifndef SORT_KEY_T
99 : #error "SORT_KEY_T must be defined"
100 : #endif
101 :
102 : /* SORT_IDX_T gives the data type used to index the arrays */
103 :
104 : #ifndef SORT_IDX_T
105 1677647813 : #define SORT_IDX_T ulong
106 : #endif
107 :
108 : /* SORT_BEFORE(a,b) evaluates to 1 if a<b is strictly true. SAFETY TIP:
109 : This is not a 3-way comparison function! */
110 :
111 : #ifndef SORT_BEFORE
112 9136829 : #define SORT_BEFORE(a,b) (a)<(b)
113 : #endif
114 :
115 : /* SORT_MERGE_THRESH / SORT_QUICK_THRESH give largest merge/quick
116 : partition size where insertion sort will be used. Should be at least
117 : 2 (and probably larger than one might expect to get optimal
118 : performance). */
119 :
120 : #ifndef SORT_MERGE_THRESH
121 6340896 : #define SORT_MERGE_THRESH 32
122 : #endif
123 :
124 : #ifndef SORT_QUICK_THRESH
125 : #define SORT_QUICK_THRESH 32
126 : #endif
127 :
128 : /* SORT_QUICK_ORDER_STYLE selects the method used for ordering two keys.
129 : Roughly, say 1 for floating point keys (see quick method for
130 : details). */
131 :
132 : #ifndef SORT_QUICK_ORDER_STYLE
133 : #define SORT_QUICK_ORDER_STYLE 0
134 : #endif
135 :
136 : /* SORT_QUICK_SWAP_MINIMIZE indicates that quick sort should eat the
137 : non-deterministic branch cost to avoid no-op swaps. This is useful
138 : if the key_t has a large memory footprint such that the no-op is
139 : actually a larger cost than a branch mispredict. */
140 :
141 : #ifndef SORT_QUICK_SWAP_MINIMIZE
142 : #define SORT_QUICK_SWAP_MINIMIZE 0
143 : #endif
144 :
145 : /* 0 - local use only
146 : 1 - library header declaration
147 : 2 - library implementation */
148 :
149 : #ifndef SORT_IMPL_STYLE
150 : #define SORT_IMPL_STYLE 0
151 : #endif
152 :
153 : /* SORT_IDX_IF(c,t,f) returns sort_idx t if c is non-zero and sort_idx f o.w. */
154 :
155 : #ifndef SORT_IDX_IF
156 575944911 : #define SORT_IDX_IF(c,t,f) ((SORT_IDX_T)fd_ulong_if( (c), (ulong)(t), (ulong)(f) ))
157 : #endif
158 :
159 : /**********************************************************************/
160 :
161 13679036 : #define SORT_(x)FD_EXPAND_THEN_CONCAT3(SORT_NAME,_,x)
162 :
163 : #if SORT_IMPL_STYLE==1 /* need prototypes */
164 :
165 : SORT_KEY_T *
166 : SORT_(private_merge)( SORT_KEY_T * key,
167 : SORT_IDX_T cnt,
168 : SORT_KEY_T * tmp );
169 :
170 : SORT_KEY_T *
171 : SORT_(private_quick)( SORT_KEY_T * key,
172 : SORT_IDX_T cnt );
173 :
174 : SORT_KEY_T *
175 : SORT_(private_select)( SORT_KEY_T * key,
176 : SORT_IDX_T cnt,
177 : SORT_IDX_T rnk );
178 :
179 : #else /* need implementations */
180 :
181 : #if SORT_IMPL_STYLE==0 /* local only */
182 : #define SORT_IMPL_STATIC static
183 : #else
184 : #define SORT_IMPL_STATIC
185 : #endif
186 :
187 : static SORT_KEY_T *
188 : SORT_(insert)( SORT_KEY_T * key,
189 5148260 : SORT_IDX_T cnt ) {
190 118480107 : for( SORT_IDX_T i=((SORT_IDX_T)1); i<cnt; i++ ) {
191 113331847 : SORT_KEY_T key_i = key[i];
192 113331847 : SORT_IDX_T j = i;
193 935724643 : while( j ) {
194 931918110 : SORT_IDX_T k = j - ((SORT_IDX_T)1);
195 931918110 : SORT_KEY_T key_j = key[k]; if( !(SORT_BEFORE( key_i, key_j )) ) break;
196 822392796 : key[j] = key_j;
197 822392796 : j = k;
198 822392796 : }
199 113331847 : key[j] = key_i;
200 113331847 : }
201 5148260 : return key;
202 5148260 : }
203 :
204 : /* FIXME: USE IMPLICIT RECURSION ALA QUICK BELOW? */
205 :
206 : SORT_IMPL_STATIC SORT_KEY_T *
207 : SORT_(private_merge)( SORT_KEY_T * key,
208 : SORT_IDX_T cnt,
209 6340896 : SORT_KEY_T * tmp ) {
210 :
211 : /* If below threshold, use insertion sort */
212 :
213 6340896 : if( cnt<=((SORT_IDX_T)(SORT_MERGE_THRESH)) ) return SORT_(insert)( key, cnt );
214 :
215 : /* Otherwise, break input in half and sort the halves */
216 :
217 2659872 : SORT_KEY_T * key_left = key;
218 2659872 : SORT_KEY_T * tmp_left = tmp;
219 2659872 : SORT_IDX_T cnt_left = cnt >> 1;
220 2659872 : SORT_KEY_T * out_left = SORT_(private_merge)( key_left, cnt_left, tmp_left );
221 :
222 2659872 : SORT_KEY_T * key_right = key + cnt_left;
223 2659872 : SORT_KEY_T * tmp_right = tmp + cnt_left;
224 2659872 : SORT_IDX_T cnt_right = cnt - cnt_left;
225 2659872 : SORT_KEY_T * out_right = SORT_(private_merge)( key_right, cnt_right, tmp_right );
226 :
227 : /* Merge out_left / out_right */
228 :
229 2659872 : if( out_left==key ) key = tmp; /* If out_left overlaps with key, merge into tmp. Otherwise, merge into key */
230 :
231 : /* Note that out_left does not overlap with the location for merge
232 : output at this point. out_right might overlap (with the right
233 : half) with the location for merge output but this will still work
234 : in that case. */
235 :
236 2659872 : SORT_IDX_T i = ((SORT_IDX_T)0);
237 2659872 : SORT_IDX_T j = ((SORT_IDX_T)0);
238 2659872 : SORT_IDX_T k = ((SORT_IDX_T)0);
239 2659872 : # if 1 /* Minimal C language operations */
240 144635166 : for(;;) { /* Note that cnt_left>0 and cnt_right>0 as cnt > SORT_MERGE_THRESH >= 1 at this point */
241 144635166 : if( SORT_BEFORE( out_right[k], out_left[j] ) ) {
242 52344009 : key[i++] = out_right[k++];
243 52344009 : if( k>=cnt_right ) {
244 401367 : do key[i++] = out_left [j++]; while( j<cnt_left ); /* append left stragglers (at least one) */
245 170760 : break;
246 170760 : }
247 92291157 : } else {
248 92291157 : key[i++] = out_left [j++];
249 92291157 : if( j>=cnt_left ) {
250 41702559 : do key[i++] = out_right[k++]; while( k<cnt_right ); /* append right stragglers (at least one) */
251 2489112 : break;
252 2489112 : }
253 92291157 : }
254 144635166 : }
255 : # else /* Branch free variant (Pyth investigations suggested the above is better) */
256 : for(;;) { /* Note that cnt_left>0 and cnt_right>0 as cnt > SORT_MERGE_THRESH >= 1 at this point */
257 : SORT_KEY_T out_j = out_left [j];
258 : SORT_KEY_T out_k = out_right[k];
259 : int from_right = (SORT_BEFORE( out_k, out_j ));
260 : key[i++] = from_right ? out_k : out_j; FD_COMPILER_FORGET( from_right );
261 : j += (SORT_IDX_T)(from_right^1); FD_COMPILER_FORGET( from_right );
262 : k += (SORT_IDX_T) from_right;
263 : int c = (j<cnt_left) & (k<cnt_right); FD_COMPILER_FORGET( c ); /* prevent compiler from branch nesting the compares */
264 : if( !c ) break;
265 : }
266 : for( ; j<cnt_left; j++ ) key[i++] = out_left [j];
267 : for( ; k<cnt_right; k++ ) key[i++] = out_right[k];
268 : # endif
269 :
270 2659872 : return key;
271 6340896 : }
272 :
273 : /* This uses a dual pivot quick sort for better theoretical and
274 : practical mojo. */
275 :
276 : SORT_IMPL_STATIC SORT_KEY_T *
277 : SORT_(private_quick)( SORT_KEY_T * key,
278 1998360 : SORT_IDX_T cnt ) {
279 1998360 : SORT_IDX_T stack[ 4UL*8UL*sizeof(SORT_IDX_T) ]; /* See note below on sizing */
280 1998360 : ulong stack_cnt = 0UL;
281 :
282 1998360 : stack[ stack_cnt++ ] = (SORT_IDX_T)0;
283 1998360 : stack[ stack_cnt++ ] = cnt;
284 6451476 : for(;;) {
285 :
286 : /* Pop the next partition to process */
287 :
288 6451476 : if( !stack_cnt ) return key; /* All done */
289 4453116 : SORT_IDX_T h = stack[ --stack_cnt ];
290 4453116 : SORT_IDX_T l = stack[ --stack_cnt ];
291 :
292 : /* [l,h) is the partition we are sorting. If this partition is
293 : small enough, sort it via insertion sort. */
294 :
295 4453116 : SORT_IDX_T n = h-l;
296 4453116 : if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) {
297 1195770 : SORT_(insert)( key+l, n );
298 1195770 : continue;
299 1195770 : }
300 :
301 : /* This partition is too large to insertion sort. Pick two pivots,
302 : sort the partition into a left, center and right partition and
303 : then recursively sort those partitions. We initially use a
304 : simple choice of pivots that is ideal for nearly sorted data (in
305 : either direction) with a little extra sampling to improve the
306 : partitioning for randomly ordered data. There is a possibility
307 : that this deterministic choice of pivots will not succeed in
308 : making two or more non-empty partitions; we detect and correct
309 : that below. */
310 :
311 : /* Initial pivot selection */
312 :
313 3257346 : SORT_KEY_T p1;
314 3257346 : SORT_KEY_T p2;
315 3257346 : do {
316 3257346 : SORT_IDX_T n3 = n / (SORT_IDX_T)3; /* magic multiply under the hood typically */
317 3257346 : SORT_IDX_T h1 = h - (SORT_IDX_T)1;
318 3257346 : SORT_KEY_T p0 = key[ l ];
319 3257346 : /**/ p1 = key[ l + n3 ];
320 3257346 : /**/ p2 = key[ h1 - n3 ];
321 3257346 : SORT_KEY_T p3 = key[ h1 ];
322 3257346 : # if SORT_QUICK_ORDER_STYLE==0
323 : /* Generates better code for integral types (branchless via stack) */
324 3257346 : SORT_KEY_T _k[2]; ulong _c;
325 16286730 : # define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
326 : # else
327 : /* Generates much better code for floating point types (branchless
328 : via min / max floating point instructions) */
329 : SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
330 : # define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
331 : # endif
332 :
333 3257346 : ORDER(p0,p2); ORDER(p1,p3);
334 3257346 : ORDER(p0,p1); ORDER(p2,p3);
335 3257346 : ORDER(p1,p2);
336 3257346 : # undef ORDER
337 3257346 : } while(0);
338 :
339 3645678 : retry: /* Three way partitioning (silly language restriction) */;
340 3645678 : SORT_IDX_T i = l;
341 3645678 : SORT_IDX_T j = l;
342 3645678 : SORT_IDX_T k = h-(SORT_IDX_T)1;
343 509786277 : do { /* Note [j,k] is non-empty (look ma ... no branches!) */
344 :
345 : /* At this point, p1 <= p2 and:
346 : - keys [l,i) are before p1 (left partition)
347 : - keys [i,j) are in [p1,p2] (center partition)
348 : - keys [j,k] are not yet partitioned (region is non-empty)
349 : - keys (k,h) are after p2 (right partition)
350 : Decide where key j should be moved. */
351 :
352 509786277 : SORT_KEY_T kj = key[j];
353 :
354 509786277 : int to_left = (SORT_BEFORE( kj, p1 ));
355 509786277 : int to_right = (SORT_BEFORE( p2, kj ));
356 509786277 : SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );
357 :
358 : # if SORT_QUICK_SWAP_MINIMIZE
359 : if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; } /* ~2/3 prob */
360 : # else
361 : /**/ key[j] = key[m]; key[m] = kj;
362 509786277 : # endif
363 :
364 509786277 : i += (SORT_IDX_T) to_left;
365 509786277 : j += (SORT_IDX_T)(to_right^1); /* to_left_or_to_center <=> !to_right */
366 509786277 : k -= (SORT_IDX_T) to_right;
367 509786277 : } while( j<=k );
368 :
369 : /* Schedule recursion */
370 :
371 3645678 : if( FD_UNLIKELY( (j-i)==n ) ) {
372 :
373 : /* All the keys ended up in the center partition. If p1<p2, we
374 : had an unlucky choice of pivots such that p1==min(key[i]) and
375 : p2==max(key[i]) for i in [l,h). Since we picked the keys
376 : deterministically above, we would have the same bad luck the
377 : next time we processed this partition. But, because there are
378 : at least two distinct elements in the partition, there is a
379 : choice of pivots that will make a non-trivial partition, so we
380 : need to redo this partition with different pivots.
381 :
382 : Randomly picking a new pivot pair in [l,h) has probability
383 : between ~50% and ~100%. The worst case is half the keys in
384 : the partition equal to p1 and the other half equal to p2; the
385 : best case exactly one key is equal to p1 / p2 and all other
386 : keys are in (p1,p2] / [p1,p2).
387 :
388 : The optimal handling of the worst case though is just to set
389 : p1=p2 (p2=p1). This will pull the keys equal to p1 (p2) into
390 : the left (right) partition and the remaining keys into the
391 : center partition; the right (left) partition will be empty.
392 : Thus, this is guaranteed to yield a non-trivial partition of
393 : the center partition but may not yield as load balanced set of
394 : partitions as random for the next iteration. We go with the
395 : deterministic option for simplicity below. */
396 :
397 1550499 : if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }
398 :
399 : /* p1==p2 here ... all keys in this partition are the same
400 : We don't need to recurse further in this partition. */
401 :
402 2095179 : } else {
403 :
404 : /* Between [1,n-1] keys ended up in the center partition such that
405 : at least 1 key ended up in another partition. We push all
406 : three partitions onto the partition stack for recursion. The
407 : order of the pushes is important to bound the maximum stack
408 : usage. Below, we push the center partition followed by the
409 : larger of the left and right partitions then followed by the
410 : smaller of the left and right partitions (such that we process
411 : the smallest next iteration). For this, the same O(log_2 cnt)
412 : max recursion depth applies as quicksort even this does a three
413 : way partitioning. That is, let fl/fc/fr be the fraction of
414 : keys in the left, center, right partitions. At this point, fl
415 : is in [0,1), fc is in (0,1) and fr is in [0,1) and fl+fc+fr=1.
416 : The smaller of the left or right partition is what process next
417 : and, as the smaller of the left or right, it fraction of the
418 : keys is at most 0.5(1-fc)<=0.5. We do push up to two
419 : partitions onto the stack (four SORT_IDX_T) each level of
420 : recursion though instead of one like normal quick sort though. */
421 :
422 2095179 : int left_larger = (i-l) > (h-j); /* True if the left partition should go on the stack */
423 2095179 : SORT_IDX_T l_larger = SORT_IDX_IF( left_larger, l, j );
424 2095179 : SORT_IDX_T h_larger = SORT_IDX_IF( left_larger, i, h );
425 2095179 : SORT_IDX_T l_smaller = SORT_IDX_IF( left_larger, j, l );
426 2095179 : SORT_IDX_T h_smaller = SORT_IDX_IF( left_larger, h, i );
427 :
428 : /* Immediately process empty partitions */
429 2095179 : ulong push_smaller = (ulong)(l_smaller<h_smaller); FD_COMPILER_FORGET( push_smaller );
430 :
431 : /* Immediately process partitions where all keys are the same */
432 2095179 : ulong push_center = (ulong)(SORT_BEFORE( p1, p2 )); FD_COMPILER_FORGET( push_center );
433 :
434 : /* Guaranteed non-empty (larger of left or right have at least 1 key) */
435 2095179 : ulong push_larger = 1UL;
436 :
437 2095179 : stack[ stack_cnt ] = l_larger; stack_cnt += push_larger;
438 2095179 : stack[ stack_cnt ] = h_larger; stack_cnt += push_larger;
439 2095179 : stack[ stack_cnt ] = i; stack_cnt += push_center;
440 2095179 : stack[ stack_cnt ] = j; stack_cnt += push_center;
441 2095179 : stack[ stack_cnt ] = l_smaller; stack_cnt += push_smaller;
442 2095179 : stack[ stack_cnt ] = h_smaller; stack_cnt += push_smaller;
443 2095179 : }
444 3645678 : }
445 : /* never get here */
446 1998360 : }
447 :
448 : SORT_IMPL_STATIC SORT_KEY_T *
449 : SORT_(private_select)( SORT_KEY_T * key,
450 : SORT_IDX_T cnt,
451 231504 : SORT_IDX_T rnk ) {
452 231504 : SORT_IDX_T l = (SORT_IDX_T)0;
453 231504 : SORT_IDX_T h = cnt;
454 748713 : for(;;) {
455 :
456 : /* The partition [l,h) contains rnk. If this partition is small
457 : enough, sort it via insertion sort. */
458 :
459 748713 : SORT_IDX_T n = h-l;
460 748713 : if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) { SORT_(insert)( key+l, n ); return key; }
461 :
462 : /* This partition is too large to insertion sort. Pick two pivots,
463 : sort the partition into a left, center and right partition and
464 : then recursive on the partition holding rnk. We initially use a
465 : simple choice of pivots that is ideal for nearly sorted data (in
466 : either direction) with a little extra sampling to improve the
467 : partitioning for randomly ordered data. There is a possibility
468 : that this deterministic choice of pivots will not succeed in
469 : making two or more non-empty partitions; we detect and correct
470 : that below. */
471 :
472 : /* Initial pivot selection */
473 :
474 517231 : SORT_KEY_T p1;
475 517231 : SORT_KEY_T p2;
476 517231 : do {
477 517231 : SORT_IDX_T n3 = n / (SORT_IDX_T)3; /* magic multiply under the hood typically */
478 517231 : SORT_IDX_T h1 = h - (SORT_IDX_T)1;
479 517231 : SORT_KEY_T p0 = key[ l ];
480 517231 : /**/ p1 = key[ l + n3 ];
481 517231 : /**/ p2 = key[ h1 - n3 ];
482 517231 : SORT_KEY_T p3 = key[ h1 ];
483 517231 : # if SORT_QUICK_ORDER_STYLE==0
484 : /* Generates better code for integral types (branchless via stack) */
485 517231 : SORT_KEY_T _k[2]; ulong _c;
486 2586155 : # define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
487 : # else
488 : /* Generates much better code for floating point types (branchless
489 : via min / max floating point instructions) */
490 : SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
491 : # define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
492 : # endif
493 :
494 517231 : ORDER(p0,p2); ORDER(p1,p3);
495 517231 : ORDER(p0,p1); ORDER(p2,p3);
496 517231 : ORDER(p1,p2);
497 517231 : # undef ORDER
498 517231 : } while(0);
499 :
500 517238 : retry: /* Three way partitioning (silly language restriction) */;
501 517238 : SORT_IDX_T i = l;
502 517238 : SORT_IDX_T j = l;
503 517238 : SORT_IDX_T k = h-(SORT_IDX_T)1;
504 57777918 : do { /* Note [j,k] is non-empty (look ma ... no branches!) */
505 :
506 : /* At this point, p1 <= p2 and:
507 : - keys [l,i) are before p1 (left partition)
508 : - keys [i,j) are in [p1,p2] (center partition)
509 : - keys [j,k] are not yet partitioned (region is non-empty)
510 : - keys (k,h) are after p2 (right partition)
511 : Decide where key j should be moved. */
512 :
513 57777918 : SORT_KEY_T kj = key[j];
514 :
515 57777918 : int to_left = (SORT_BEFORE( kj, p1 ));
516 57777918 : int to_right = (SORT_BEFORE( p2, kj ));
517 57777918 : SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );
518 :
519 : # if SORT_QUICK_SWAP_MINIMIZE
520 : if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; } /* ~2/3 prob */
521 : # else
522 : /**/ key[j] = key[m]; key[m] = kj;
523 57777918 : # endif
524 :
525 57777918 : i += (SORT_IDX_T) to_left;
526 57777918 : j += (SORT_IDX_T)(to_right^1); /* to_left_or_to_center <=> !to_right */
527 57777918 : k -= (SORT_IDX_T) to_right;
528 57777918 : } while( j<=k );
529 :
530 517238 : if( FD_UNLIKELY( (j-i)==n ) ) {
531 :
532 : /* All the keys ended up in the center partition. If p1<p2, we
533 : had an unlucky choice of pivots such that p1==min(key[i]) and
534 : p2==max(key[i]) for i in [l,h). Since we picked the keys
535 : deterministically above, we would have the same bad luck the
536 : next time we processed this partition. But, because there are
537 : at least two distinct elements in the partition, there is a
538 : choice of pivots that will make a non-trivial partition, so we
539 : need to redo this partition with different pivots.
540 :
541 : Randomly picking a new pivot pair in [l,h) has probability
542 : between ~50% and ~100%. The worst case is half the keys in
543 : the partition equal to p1 and the other half equal to p2; the
544 : best case exactly one key is equal to p1 / p2 and all other
545 : keys are in (p1,p2] / [p1,p2).
546 :
547 : The optimal handling of the worst case though is just to set
548 : p1=p2 (p2=p1). This will pull the keys equal to p1 (p2) into
549 : the left (right) partition and the remaining keys into the
550 : center partition; the right (left) partition will be empty.
551 : Thus, this is guaranteed to yield a non-trivial partition of
552 : the center partition but may not yield as load balanced set of
553 : partitions as random for the next iteration. We go with the
554 : deterministic option for simplicity below. */
555 :
556 29 : if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }
557 :
558 : /* p1==p2 here ... all keys in this partition are the same. We
559 : don't need to recurse further in this partition as all keys
560 : would do here. */
561 :
562 22 : return key;
563 29 : }
564 :
565 : /* At this point:
566 : - [l,i) is the left partition,
567 : - [i,j) is the center partition
568 : - [j,h) is the right partition
569 : Between [1,n-1] keys ended up in the center partition such that
570 : at least 1 key ended up in another partition. Recurse on the
571 : partition that contains rnk. As this is typically used in median
572 : finding, we assume that the center partition is the most likely
573 : in the below selection (this can also be done branchlessly). */
574 :
575 517209 : SORT_IDX_T l_next = i; SORT_IDX_T h_next = j;
576 517209 : if( FD_UNLIKELY( rnk< i ) ) l_next = l, h_next = i;
577 517209 : if( FD_UNLIKELY( j <=rnk ) ) l_next = j, h_next = h;
578 517209 : l = l_next; h = h_next;
579 517209 : }
580 : /* never get here */
581 231504 : }
582 :
583 : #undef SORT_IMPL_STATIC
584 :
585 : #endif
586 :
587 : #if SORT_IMPL_STYLE==0 || SORT_IMPL_STYLE==1 /* need interface */
588 :
589 0 : FD_FN_CONST static inline int SORT_(stable_cnt_valid)( SORT_IDX_T cnt ) {
590 0 : /* Written funny for supporting different signed idx types without
591 0 : generating compiler warnings */
592 0 : SORT_IDX_T max = (SORT_IDX_T)fd_ulong_min( (-alignof(SORT_KEY_T))/sizeof(SORT_KEY_T), /* ==floor( (2^64-align-1)/sz ) */
593 0 : (ulong)(SORT_IDX_T)((~0UL)>>1) ); /* ==SORT_IDX_MAX as ulong */
594 0 : return (!cnt) | ((((SORT_IDX_T)0)<cnt) & (cnt<max)) | (cnt==max);
595 0 : }
596 :
597 0 : FD_FN_CONST static inline ulong SORT_(stable_scratch_align) ( void ) { return alignof(SORT_KEY_T); }
598 0 : FD_FN_CONST static inline ulong SORT_(stable_scratch_footprint)( SORT_IDX_T cnt ) { return sizeof (SORT_KEY_T)*(ulong)cnt; }
599 :
600 : static inline SORT_KEY_T *
601 : SORT_(stable_fast)( SORT_KEY_T * key,
602 : SORT_IDX_T cnt,
603 510576 : void * scratch ) {
604 510576 : SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
605 510576 : return SORT_(private_merge)( key, cnt, tmp );
606 510576 : }
607 :
608 : FD_FN_UNUSED static SORT_KEY_T * /* Work around -Winline */
609 : SORT_(stable)( SORT_KEY_T * key,
610 : SORT_IDX_T cnt,
611 510576 : void * scratch ) {
612 510576 : SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
613 7771710 : if( SORT_(private_merge)( key, cnt, tmp )==tmp ) for( SORT_IDX_T i=((SORT_IDX_T)0); i<cnt; i++ ) key[i] = tmp[i];
614 510576 : return key;
615 510576 : }
616 :
617 : static inline SORT_KEY_T *
618 : SORT_(inplace)( SORT_KEY_T * key,
619 1998360 : SORT_IDX_T cnt ) {
620 1998360 : return SORT_(private_quick)( key, cnt );
621 1998360 : }
622 :
623 : static inline SORT_KEY_T *
624 : SORT_(select)( SORT_KEY_T * key,
625 : SORT_IDX_T cnt,
626 231504 : SORT_IDX_T rnk ) {
627 231504 : return SORT_(private_select)( key, cnt, rnk );
628 231504 : }
629 :
630 : static inline SORT_IDX_T
631 : SORT_(search_geq)( SORT_KEY_T const * sorted,
632 : SORT_IDX_T cnt,
633 6153 : SORT_KEY_T query ) {
634 6153 : SORT_IDX_T j = (SORT_IDX_T)0;
635 6153 : SORT_IDX_T n = (SORT_IDX_T)cnt;
636 67653 : while( n>((SORT_IDX_T)1) ) {
637 : /* At this point the j corresponding to k is in [j,j+n) */
638 61500 : SORT_IDX_T j_left = j; SORT_IDX_T n_left = n >> 1;
639 61500 : SORT_IDX_T j_right = j + n_left; SORT_IDX_T n_right = n - n_left;
640 61500 : int go_left = SORT_BEFORE( query, sorted[ j_right ] );
641 61500 : j = go_left ? j_left : j_right; /* branchless */
642 61500 : n = go_left ? n_left : n_right; /* branchless */
643 61500 : }
644 6153 : return j;
645 6153 : }
646 :
647 : #endif
648 :
649 : #undef SORT_
650 :
651 : #undef SORT_IDX_IF
652 : #undef SORT_IMPL_STYLE
653 : #undef SORT_QUICK_SWAP_MINIMIZE
654 : #undef SORT_QUICK_ORDER_STYLE
655 : #undef SORT_QUICK_THRESH
656 : #undef SORT_MERGE_THRESH
657 : #undef SORT_BEFORE
658 : #undef SORT_IDX_T
659 : #undef SORT_KEY_T
660 : #undef SORT_NAME
661 :
|