Line data Source code
1 : /* Declares a family of functions useful for single threaded sorting of
2 : POD types in high performance contexts. Example usage:
3 :
4 : #define SORT_NAME sort_double_descend
5 : #define SORT_KEY_T double
6 : #define SORT_BEFORE(a,b) (a)>(b)
7 : #include "util/tmpl/fd_sort.c"
8 :
9 :    will create the following API for use in the local compilation unit:
10 :
11 :      // Sort key[i] for i in [0,cnt) stable in place in a best / average /
12 :      // worst case of O(N) / O(N^2) / O(N^2) operations.
13 :
14 : double *
15 : sort_double_descend_insert( double * key,
16 : ulong cnt );
17 :
18 : // Return 1 if cnt is a sensible value for the sorting APIs
19 : // (i.e. cnt*sizeof(double)+alignof(double)-1 will not overflow,
20 : // 3*cnt*ceil(log_3(cnt)) will not overflow, etc).
21 :
22 : int sort_double_descend_cnt_valid( ulong cnt );
23 :
24 : // Return the alignment and footprint required for a scratch region
25 : // adequate for sorting up to cnt elements. The results will be
26 : // standard allocator and standard declaration friendly. (E.g. a
27 : // declaration "double scratch[ cnt ];" will be fine as a scratch
28 : // region.)
29 :
30 : ulong sort_double_descend_stable_scratch_align ( void );
31 : ulong sort_double_descend_stable_scratch_footprint( ulong cnt );
32 :
33 : // Sort key[i] for i in [0,cnt) stable in best / average / worst
34 : // case of O(N lg N) / O(N lg N) / O(N lg N) operations. Scratch
35 : // is a scratch workspace of suitable alignment and footprint.
36 : // Returns where the sorted values ended up. Will be at either key
37 : // or (double *)scratch.
38 :
39 : double *
40 : sort_double_descend_stable_fast( double * key,
41 : ulong cnt,
42 : void * scratch );
43 :
44 : // Same as above but does additional copying if necessary such that
45 : // the final result ends up in key. Returns key.
46 :
47 : double *
48 : sort_double_descend_stable( double * key,
49 : ulong cnt,
50 : void * scratch );
51 :
52 : // Sort key[i] for i in [0,cnt) inplace in best / average / worst
53 : // case of O(N lg N) / O(N lg N) / O(N^2) operations. Returns key.
54 :
55 : double *
56 : sort_double_descend_inplace( double * key,
57 : ulong cnt );
58 :
59 :      // Partially sort key[i] such that key[rnk] holds the value it would
60 :      // hold if key were fully sorted, in a best / average / worst case of
61 :      // O(N) / O(N) / O(N^2) operations.  Returns key.  Assumes key is
62 :      // non-NULL, cnt is valid and positive and rnk is in [0,cnt).
63 :
64 : double *
65 : sort_double_descend_select( double * key,
66 : ulong cnt,
67 : ulong rnk );
68 :
69 :      // Given a sorted array sorted, indexed [0,cnt),
70 :      // sort_double_descend_split returns an index i in [0,cnt] such that
71 :      // all entries in [0,i) are BEFORE query and all entries in [i,cnt)
72 :      // are NOT BEFORE query.
73 :
74 : ulong
75 : sort_double_descend_split( double const * sorted,
76 : ulong cnt,
77 : double query );
78 :
79 :      // sort_double_descend_{stable_fast,stable,inplace}_para are the
80 :      // same as above but adaptively parallelized over the caller
81 : // (typically tpool thread t0) and tpool threads (t0,t1). Assumes
82 : // tpool is valid, tpool threads (t0,t1) are idle (or soon to be
83 : // idle). These are only available if SORT_PARALLEL was requested.
84 :
85 : double *
86 : sort_double_descend_stable_fast_para( fd_tpool_t * tpool, ulong t0, ulong t1,
87 : double * key,
88 : ulong cnt,
89 : void * scratch );
90 :
91 : double *
92 : sort_double_descend_stable_para( fd_tpool_t * tpool, ulong t0, ulong t1,
93 : double * key,
94 : ulong cnt,
95 : void * scratch );
96 :
97 : double *
98 : sort_double_descend_inplace_para( fd_tpool_t * tpool, ulong t0, ulong t1,
99 : double * key,
100 : ulong cnt );
101 :
102 :      // sort_double_descend_fast_para has asymptotically better
103 :      // parallelization on average (~(N lg N)/T + lg T) but requires
104 :      // more stack space on the caller (O(T^2)).  seed gives a random seed
105 : // to use (e.g. fd_tickcount()). If stable is non-zero, a stable
106 : // method will be used (and the resulting order will always be
107 : // deterministic). Otherwise a faster unstable method will be used
108 : // (the order in which equal keys end up in the array will depend
109 : // on seed). Assumes tpool is valid and tpool threads (t0,t1) are
110 : // idle (or soon to be idle). Only available if SORT_PARALLEL was
111 : // requested and the target has FD_HAS_ALLOCA.
112 :
113 : double *
114 : sort_double_descend_fast_para( fd_tpool_t * tpool, ulong t0, ulong t1,
115 : double * key,
116 : ulong cnt,
117 : void * scratch,
118 : ulong seed,
119 : int stable );
120 :
121 : It is fine to include this template multiple times in a compilation
122 : unit. Just provide the specification before each inclusion. Various
123 : additional options to tune the methods are described below. */
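/* For example (an illustrative sketch, assuming the sort_double_descend
   instantiation above), sorting a small array with the generated stable
   API and a stack scratch region:

     double key[8] = { 1., 5., 2., 8., 3., 7., 4., 6. };
     double scratch[8];                   // fine per the scratch notes above
     if( sort_double_descend_cnt_valid( 8UL ) )
       sort_double_descend_stable( key, 8UL, scratch );
     // key now holds 8., 7., 6., 5., 4., 3., 2., 1. */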
124 :
125 : /* SORT_NAME gives the name of the function to declare (and the base
126 : name of auxiliary and/or variant functions). */
127 :
128 : #ifndef SORT_NAME
129 : #error "SORT_NAME must be defined"
130 : #endif
131 :
132 : /* SORT_KEY_T gives the POD datatype to sort. */
133 :
134 : #ifndef SORT_KEY_T
135 : #error "SORT_KEY_T must be defined"
136 : #endif
137 :
138 : /* SORT_IDX_T gives the data type used to index the arrays */
139 :
140 : #ifndef SORT_IDX_T
141 1748115683 : #define SORT_IDX_T ulong
142 : #endif
143 :
144 : /* SORT_BEFORE(a,b) evaluates to 1 if a should come strictly before b and
145 :    0 otherwise (defaults to (a)<(b)).  SAFETY TIP: This is not a 3-way
146 :    comparison function! */
146 :
147 : #ifndef SORT_BEFORE
148 9120832 : #define SORT_BEFORE(a,b) (a)<(b)
149 : #endif
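/* For example (a hypothetical instantiation sketch, not part of this file),
   a POD struct can be sorted by one of its fields by keying SORT_BEFORE off
   that field:

     typedef struct { ulong prio; ulong payload; } task_t;

     #define SORT_NAME        sort_task
     #define SORT_KEY_T       task_t
     #define SORT_BEFORE(a,b) ((a).prio<(b).prio)
     #include "util/tmpl/fd_sort.c"

   Note that SORT_BEFORE only says whether a goes strictly before b; it is
   not a 3-way comparator. */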
150 :
151 : /* SORT_MERGE_THRESH / SORT_QUICK_THRESH give the largest merge / quick
152 :    partition size for which insertion sort will be used.  Should be at
153 :    least 2 (and probably larger than one might expect to get optimal
154 :    performance). */
155 :
156 : #ifndef SORT_MERGE_THRESH
157 6340224 : #define SORT_MERGE_THRESH 32
158 : #endif
159 :
160 : #ifndef SORT_QUICK_THRESH
161 : #define SORT_QUICK_THRESH 32
162 : #endif
163 :
164 : /* SORT_QUICK_ORDER_STYLE selects the method used for ordering two keys.
165 :    Roughly, use 0 for integral keys and 1 for floating point keys (see
166 :    the quick method below for details). */
167 :
168 : #ifndef SORT_QUICK_ORDER_STYLE
169 : #define SORT_QUICK_ORDER_STYLE 0
170 : #endif
171 :
172 : /* SORT_QUICK_SWAP_MINIMIZE indicates that quick sort should eat the
173 : non-deterministic branch cost to avoid no-op swaps. This is useful
174 : if the key_t has a large memory footprint such that the no-op is
175 : actually a larger cost than a branch mispredict. */
176 :
177 : #ifndef SORT_QUICK_SWAP_MINIMIZE
178 : #define SORT_QUICK_SWAP_MINIMIZE 0
179 : #endif
180 :
181 : /* SORT_PARALLEL will generate thread parallel versions of many sorts.
182 : Requires tpool. */
183 :
184 : #ifndef SORT_PARALLEL
185 : #define SORT_PARALLEL 0
186 : #endif
187 :
188 : /* SORT_OVERSAMPLE_RATIO indicates the amount of oversampling fast
189 :    parallel sorts should use to find pivots.  Only relevant if
190 : SORT_PARALLEL is true and FD_HAS_ALLOCA. */
191 :
192 : #ifndef SORT_OVERSAMPLE_RATIO
193 0 : #define SORT_OVERSAMPLE_RATIO 5
194 : #endif
195 :
196 : /* SORT_IDX_IF(c,t,f) returns sort_idx t if c is non-zero and sort_idx f o.w. */
197 :
198 : #ifndef SORT_IDX_IF
199 604539500 : #define SORT_IDX_IF(c,t,f) ((SORT_IDX_T)fd_ulong_if( (c), (ulong)(t), (ulong)(f) ))
200 : #endif
201 :
202 : /* SORT_FN_ATTR applies extra function attributes. */
203 :
204 : #ifndef SORT_FN_ATTR
205 : #define SORT_FN_ATTR
206 : #endif
207 :
208 : /* SORT_IMPL_STYLE controls what this template generates:
209 :      0 - local use only
210 :      1 - library header declaration
211 :      2 - library implementation */
211 :
212 : #ifndef SORT_IMPL_STYLE
213 : #define SORT_IMPL_STYLE 0
214 : #endif
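/* For example (a hypothetical two-file sketch; my_sort.h / my_sort.c are
   illustrative names, not part of this template), a library can expose one
   instantiation by emitting prototypes from its header and implementations
   from exactly one compilation unit:

     // my_sort.h
     #define SORT_NAME       my_sort
     #define SORT_KEY_T      ulong
     #define SORT_IMPL_STYLE 1
     #include "util/tmpl/fd_sort.c"

     // my_sort.c
     #include "my_sort.h"
     #define SORT_NAME       my_sort
     #define SORT_KEY_T      ulong
     #define SORT_IMPL_STYLE 2
     #include "util/tmpl/fd_sort.c"
*/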
215 :
216 : /* Implementation *****************************************************/
217 :
218 : #if SORT_IMPL_STYLE==0 /* local use only */
219 : #define SORT_STATIC FD_FN_UNUSED static
220 : #else /* library header and/or implementation */
221 : #define SORT_STATIC
222 : #endif
223 :
224 21988184 : #define SORT_(x)FD_EXPAND_THEN_CONCAT3(SORT_NAME,_,x)
225 :
226 : #if SORT_IMPL_STYLE!=2 /* need header */
227 :
228 : #if SORT_PARALLEL
229 : #include "../tpool/fd_tpool.h"
230 : #else
231 : #include "../bits/fd_bits.h"
232 : #endif
233 :
234 : FD_PROTOTYPES_BEGIN
235 :
236 : FD_FN_CONST static inline int
237 0 : SORT_(cnt_valid)( SORT_IDX_T cnt ) {
238 0 : /* Written funny for supporting different signed idx types without
239 0 : generating compiler warnings. */
240 0 : SORT_IDX_T max = (SORT_IDX_T)fd_ulong_min( (-alignof(SORT_KEY_T))/sizeof(SORT_KEY_T), /* ==floor( (2^64-align-1)/sz ) */
241 0 : (ulong)(SORT_IDX_T)((1UL<<57)-1UL) ); /* ==SORT_IDX_MAX as ulong */
242 0 : return (!cnt) | ((((SORT_IDX_T)0)<cnt) & (cnt<max)) | (cnt==max);
243 0 : }
244 :
245 0 : FD_FN_CONST static inline ulong SORT_(stable_scratch_align) ( void ) { return alignof(SORT_KEY_T); }
246 0 : FD_FN_CONST static inline ulong SORT_(stable_scratch_footprint)( SORT_IDX_T cnt ) { return sizeof (SORT_KEY_T)*(ulong)cnt; }
247 :
248 : SORT_FN_ATTR SORT_STATIC SORT_KEY_T *
249 : SORT_(private_merge)( SORT_KEY_T * key,
250 : long cnt,
251 : SORT_KEY_T * tmp );
252 :
253 : SORT_FN_ATTR SORT_STATIC SORT_KEY_T *
254 : SORT_(private_quick)( SORT_KEY_T * key,
255 : SORT_IDX_T cnt );
256 :
257 : SORT_FN_ATTR SORT_STATIC SORT_KEY_T *
258 : SORT_(private_select)( SORT_KEY_T * key,
259 : SORT_IDX_T cnt,
260 : SORT_IDX_T rnk );
261 :
262 : SORT_FN_ATTR SORT_STATIC SORT_KEY_T *
263 : SORT_(insert)( SORT_KEY_T * key,
264 : SORT_IDX_T cnt );
265 :
266 : static inline SORT_KEY_T *
267 : SORT_(stable_fast)( SORT_KEY_T * key,
268 : SORT_IDX_T cnt,
269 510579 : void * scratch ) {
270 510579 : if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key;
271 510240 : SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
272 510240 : return SORT_(private_merge)( key, (long)cnt, tmp );
273 510579 : }
274 :
275 : static inline SORT_KEY_T *
276 : SORT_(stable)( SORT_KEY_T * key,
277 : SORT_IDX_T cnt,
278 510576 : void * scratch ) {
279 510576 : if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key;
280 510240 : SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
281 510240 : if( SORT_(private_merge)( key, (long)cnt, tmp )==tmp ) /* 50/50 branch prob */
282 105270 : memcpy( key, tmp, sizeof(SORT_KEY_T)*(ulong)cnt );
283 510240 : return key;
284 510576 : }
285 :
286 : static inline SORT_KEY_T *
287 : SORT_(inplace)( SORT_KEY_T * key,
288 4699236 : SORT_IDX_T cnt ) {
289 4699236 : return SORT_(private_quick)( key, cnt );
290 4699236 : }
291 :
292 : static inline SORT_KEY_T *
293 : SORT_(select)( SORT_KEY_T * key,
294 : SORT_IDX_T cnt,
295 231484 : SORT_IDX_T rnk ) {
296 231484 : return SORT_(private_select)( key, cnt, rnk );
297 231484 : }
298 :
299 : static inline SORT_IDX_T
300 : SORT_(split)( SORT_KEY_T const * sorted,
301 : SORT_IDX_T cnt,
302 3372 : SORT_KEY_T query ) {
303 3372 : SORT_IDX_T j = (SORT_IDX_T)0;
304 3372 : SORT_IDX_T k = (SORT_IDX_T)cnt;
305 18069 : for(;;) {
306 18069 : SORT_IDX_T n = k-j;
307 18069 : if( FD_UNLIKELY( n<(SORT_IDX_T)1 ) ) break;
308 :
309 : /* At this point, entries [0,j) are known "lt" query (i.e. before),
310 : entries [j,k) are unknown (this range is non-empty and sorted),
311 :        entries [k,cnt) are known "geq" query (i.e. not before).
312 : Test the entry in the middle of the unknown range. */
313 :
314 14697 : SORT_IDX_T m = j + (n>>1);
315 :
316 14697 : int c = SORT_BEFORE( sorted[ m ], query );
317 :
318 : /* At this point:
319 : If c is 1, entry m is before query. As such, entries [0,m] are all known "lt" query and range (m,k) is unknown now.
320 :        If c is 0, entry m is not before query.  As such, entries [m,cnt) are all known "geq" query and range [j,m) is unknown now. */
321 :
322 14697 : j = c ? (m+(SORT_IDX_T)1) : j; /* cmov */
323 14697 : k = c ? k : m; /* cmov */
324 14697 : }
325 :
326 :   /* At this point, [0,j) are known "lt" query and [k,cnt) are known
327 : "geq" query and j==k such that [j,k) is an empty range. */
328 :
329 3372 : return k;
330 3372 : }
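/* For example (an illustrative sketch, assuming the sort_double_descend
   instantiation from the header comment), split gives both the count of
   entries strictly before query and the index at which query could be
   inserted while keeping the array sorted:

     ulong i = sort_double_descend_split( sorted, cnt, query );
     // sorted[ 0,i ) are > query (descending order), sorted[ i,cnt ) are <= query
     // inserting query at index i preserves the sorted order */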
331 :
332 : #if SORT_PARALLEL
333 :
334 : SORT_STATIC FD_MAP_REDUCE_PROTO( SORT_(private_merge_para) );
335 : SORT_STATIC FD_FOR_ALL_PROTO ( SORT_(private_memcpy_para) );
336 :
337 : static inline SORT_KEY_T *
338 : SORT_(stable_fast_para)( fd_tpool_t * tpool,
339 : ulong t0,
340 : ulong t1,
341 : SORT_KEY_T * key,
342 : SORT_IDX_T cnt,
343 0 : void * scratch ) {
344 :
345 : /* The wallclock time of the below for N keys and T threads is
346 : roughly:
347 :
348 : alpha N + beta N (ln N - ln T) / T + gamma ln T
349 :
350 : where the first term represents the cost of the final sort rounds
351 : (which are parallelized over increasingly fewer threads than the
352 : initial rounds ... wallclock sums up to something proportional to N
353 : in the limit of large T), the second term represents the cost of
354 : the initial parallel rounds (which are parallelized over T threads)
355 : and the last term represents the wallclock to dispatch and
356 : synchronize the threads. Optimizing T for a given N yields:
357 :
358 : -beta N (ln N - ln T_opt) / T_opt^2 - beta N / T_opt^2 + gamma / T_opt = 0
359 :
360 : Solving for T_opt in the limit N >> T_opt yields:
361 :
362 : T_opt = (beta/gamma) N ln N
363 :
364 :      where beta is roughly the merge pass marginal wallclock
365 : per key and gamma is proportional to the marginal wallclock to
366 : start and stop a thread. Since these are typically used over a
367 : domain of moderate N, we can approximate ln N by ln N_ref,
368 : yielding:
369 :
370 : T_opt ~ N / thresh
371 :
372 :      where thresh ~ gamma / (beta ln N_ref).  We use an empirical value
373 :      such that each thread will typically have at least one normal page
374 :      of keys worth of work to process in its block. */
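  /* Illustrative arithmetic: for an 8 byte key, thresh below is
     (4096+7)/8 = 512 keys, so sorting cnt = 2^20 keys uses at most
     min( t1-t0, 2048 ) threads. */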
375 :
376 0 : if( FD_UNLIKELY( cnt<2L ) ) return key;
377 :
378 0 : static ulong const thresh = (4096UL + sizeof(SORT_KEY_T)-1UL) / sizeof(SORT_KEY_T); /* ceil(page_sz/key_sz) */
379 0 : ulong t_cnt = fd_ulong_min( t1 - t0, ((ulong)cnt) / thresh );
380 0 : if( FD_UNLIKELY( t_cnt<2UL ) ) return SORT_(stable_fast)( key, cnt, scratch );
381 0 : t1 = t0 + t_cnt;
382 :
383 0 : SORT_KEY_T * out[1];
384 0 : FD_MAP_REDUCE( SORT_(private_merge_para), tpool,t0,t1, 0L,(long)cnt, out, key, scratch );
385 0 : return out[0];
386 0 : }
387 :
388 : static inline SORT_KEY_T *
389 : SORT_(stable_para)( fd_tpool_t * tpool,
390 : ulong t0,
391 : ulong t1,
392 : SORT_KEY_T * key,
393 : SORT_IDX_T cnt,
394 0 : void * scratch ) {
395 :
396 : /* This works the same as the above but does a parallel memcpy if the
397 : result doesn't end up in the correct place. */
398 :
399 0 : if( FD_UNLIKELY( cnt<2L ) ) return key;
400 :
401 0 : static ulong const thresh = (4096UL + sizeof(SORT_KEY_T)-1UL) / sizeof(SORT_KEY_T); /* ceil(page_sz/key_sz) */
402 0 : ulong t_cnt = fd_ulong_min( t1 - t0, ((ulong)cnt) / thresh );
403 0 : if( FD_UNLIKELY( t_cnt<2UL ) ) return SORT_(stable)( key, cnt, scratch );
404 0 : t1 = t0 + t_cnt;
405 :
406 0 : SORT_KEY_T * out[1];
407 0 : FD_MAP_REDUCE( SORT_(private_merge_para), tpool,t0,t1, 0L,(long)cnt, out, key, scratch );
408 0 : if( out[0]!=key ) FD_FOR_ALL( SORT_(private_memcpy_para), tpool,t0,t1, 0L,(long)cnt, key, scratch ); /* 50/50 branch prob */
409 0 : return key;
410 0 : }
411 :
412 : SORT_FN_ATTR SORT_STATIC void
413 : SORT_(private_quick_node)( void * _tpool,
414 : ulong t0, ulong t1,
415 : void * _args,
416 : void * _reduce, ulong _stride,
417 : ulong _l0, ulong _l1,
418 : ulong _m0, ulong _m1,
419 : ulong _n0, ulong _n1 );
420 :
421 : static inline SORT_KEY_T *
422 : SORT_(inplace_para)( fd_tpool_t * tpool,
423 : ulong t0,
424 : ulong t1,
425 : SORT_KEY_T * key,
426 0 : SORT_IDX_T cnt ) {
427 0 : if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key;
428 0 : SORT_(private_quick_node)( tpool,t0,t1, (void *)key, (void *)(ulong)cnt, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL );
429 0 : return key;
430 0 : }
431 :
432 : #if FD_HAS_ALLOCA
433 :
434 : SORT_FN_ATTR SORT_STATIC SORT_KEY_T *
435 : SORT_(fast_para)( fd_tpool_t * tpool, ulong t0, ulong t1,
436 : SORT_KEY_T * key,
437 : SORT_IDX_T cnt,
438 : void * scratch,
439 : ulong seed,
440 : int stable );
441 :
442 : #endif /* FD_HAS_ALLOCA */
443 :
444 : #endif /* SORT_PARALLEL */
445 :
446 : FD_PROTOTYPES_END
447 :
448 : #endif
449 :
450 : #if SORT_IMPL_STYLE!=1 /* need implementations (assumes header already included) */
451 :
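/* Standard insertion sort: for each i, shift keys that key_i is BEFORE up
   one slot and drop key_i into the gap.  Equal keys keep their original
   relative order, so this is stable. */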
452 : SORT_FN_ATTR SORT_KEY_T *
453 : SORT_(insert)( SORT_KEY_T * key,
454 8873959 : SORT_IDX_T cnt ) {
455 134563797 : for( SORT_IDX_T i=((SORT_IDX_T)1); i<cnt; i++ ) {
456 125689838 : SORT_KEY_T key_i = key[i];
457 125689838 : SORT_IDX_T j = i;
458 965119372 : while( j ) {
459 958785324 : SORT_IDX_T k = j - ((SORT_IDX_T)1);
460 958785324 : SORT_KEY_T key_j = key[k]; if( !(SORT_BEFORE( key_i, key_j )) ) break;
461 839429534 : key[j] = key_j;
462 839429534 : j = k;
463 839429534 : }
464 125689838 : key[j] = key_i;
465 125689838 : }
466 8873959 : return key;
467 8873959 : }
468 :
469 : SORT_FN_ATTR static void
470 : SORT_(private_merge_pass)( SORT_KEY_T const * key_l, long cnt_l,
471 : SORT_KEY_T const * key_r, long cnt_r,
472 2659872 : SORT_KEY_T * key_m ) {
473 2659872 : long i = 0L;
474 2659872 : long j = 0L;
475 2659872 : long k = 0L;
476 2659872 : # if 1 /* Minimal C language operations */
477 144635166 : for(;;) { /* Note that cnt_left>0 and cnt_right>0 as cnt > SORT_MERGE_THRESH >= 1 at this point */
478 144635166 : if( SORT_BEFORE( key_r[j], key_l[i] ) ) {
479 52344009 : key_m[k++] = key_r[j++];
480 52344009 : if( j>=cnt_r ) {
481 170760 : memcpy( key_m + k, key_l + i, sizeof(SORT_KEY_T)*(ulong)(cnt_l-i) ); /* append left stragglers (at least one) */
482 170760 : break;
483 170760 : }
484 92291157 : } else {
485 92291157 : key_m[k++] = key_l[i++];
486 92291157 : if( i>=cnt_l ) {
487 2489112 : memcpy( key_m + k, key_r + j, sizeof(SORT_KEY_T)*(ulong)(cnt_r-j) ); /* append right stragglers (at least one) */
488 2489112 : break;
489 2489112 : }
490 92291157 : }
491 144635166 : }
492 : # else /* Branch free variant (Pyth investigations suggested the above is better) */
493 : while( ((i<cnt_l) & (j<cnt_r)) ) {
494 : SORT_KEY_T ki = key_l[ i ];
495 : SORT_KEY_T kj = key_r[ j ];
496 : int use_left = !(SORT_BEFORE( kj, ki ));
497 : key_m[ k ] = use_left ? ki : kj; /* cmov ideally */
498 : i += (long) use_left;
499 : j += (long)!use_left;
500 : k++;
501 : }
502 : if( i<cnt_l ) memcpy( key_m + k, key_l + i, sizeof(SORT_KEY_T)*(ulong)(cnt_l-i) );
503 : else if( j<cnt_r ) memcpy( key_m + k, key_r + j, sizeof(SORT_KEY_T)*(ulong)(cnt_r-j) );
504 : # endif
505 2659872 : }
506 :
507 : SORT_FN_ATTR SORT_KEY_T *
508 : SORT_(private_merge)( SORT_KEY_T * key,
509 : long cnt,
510 6340224 : SORT_KEY_T * tmp ) {
511 :
512 : /* FIXME: USE IMPLICIT RECURSION ALA QUICK BELOW? */
513 :
514 : /* If below threshold, use insertion sort */
515 :
516 6340224 : if( cnt<=((long)(SORT_MERGE_THRESH)) ) return SORT_(insert)( key, (SORT_IDX_T)cnt );
517 :
518 : /* Otherwise, break input in half and sort the halves */
519 :
520 2659872 : SORT_KEY_T * key_l = key;
521 2659872 : SORT_KEY_T * tmp_l = tmp;
522 2659872 : long cnt_l = cnt >> 1;
523 2659872 : SORT_KEY_T * in_l = SORT_(private_merge)( key_l, cnt_l, tmp_l );
524 :
525 2659872 : SORT_KEY_T * key_r = key + cnt_l;
526 2659872 : SORT_KEY_T * tmp_r = tmp + cnt_l;
527 2659872 : long cnt_r = cnt - cnt_l;
528 2659872 : SORT_KEY_T * in_r = SORT_(private_merge)( key_r, cnt_r, tmp_r );
529 :
530 : /* Merge in_l / in_r */
531 :
532 2659872 : SORT_KEY_T * out = (in_l==key) ? tmp : key; /* If in_l overlaps with key, merge into tmp. Otherwise, merge into key */
533 :
534 : /* Note that in_l does not overlap with out at this point. in_r might
535 : overlap with the right half of out but the merge pass is fine for
536 : that case. */
537 :
538 2659872 : SORT_(private_merge_pass)( in_l, cnt_l, in_r, cnt_r, out );
539 :
540 2659872 : return out;
541 6340224 : }
542 :
543 : /* This uses a dual pivot quick sort for better theoretical and
544 : practical mojo. */
545 :
546 : SORT_FN_ATTR SORT_KEY_T *
547 : SORT_(private_quick)( SORT_KEY_T * key,
548 4699236 : SORT_IDX_T cnt ) {
549 4699236 : SORT_IDX_T stack[ 4UL*8UL*sizeof(SORT_IDX_T) ]; /* See note below on sizing */
550 4699236 : ulong stack_cnt = 0UL;
551 :
552 4699236 : stack[ stack_cnt++ ] = (SORT_IDX_T)0;
553 4699236 : stack[ stack_cnt++ ] = cnt;
554 12226593 : for(;;) {
555 :
556 : /* Pop the next partition to process */
557 :
558 12226593 : if( !stack_cnt ) return key; /* All done */
559 7527357 : SORT_IDX_T h = stack[ --stack_cnt ];
560 7527357 : SORT_IDX_T l = stack[ --stack_cnt ];
561 :
562 : /* [l,h) is the partition we are sorting. If this partition is
563 : small enough, sort it via insertion sort. */
564 :
565 7527357 : SORT_IDX_T n = h-l;
566 7527357 : if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) {
567 4145556 : SORT_(insert)( key+l, n );
568 4145556 : continue;
569 4145556 : }
570 :
571 : /* This partition is too large to insertion sort. Pick two pivots,
572 : sort the partition into a left, center and right partition and
573 : then recursively sort those partitions. We initially use a
574 : simple choice of pivots that is ideal for nearly sorted data (in
575 : either direction) with a little extra sampling to improve the
576 : partitioning for randomly ordered data. There is a possibility
577 : that this deterministic choice of pivots will not succeed in
578 : making two or more non-empty partitions; we detect and correct
579 : that below. */
580 :
581 : /* Initial pivot selection */
582 :
583 3381801 : SORT_KEY_T p1;
584 3381801 : SORT_KEY_T p2;
585 3381801 : do {
586 3381801 : SORT_IDX_T n3 = n / (SORT_IDX_T)3; /* magic multiply under the hood typically */
587 3381801 : SORT_IDX_T h1 = h - (SORT_IDX_T)1;
588 3381801 : SORT_KEY_T p0 = key[ l ];
589 3381801 : /**/ p1 = key[ l + n3 ];
590 3381801 : /**/ p2 = key[ h1 - n3 ];
591 3381801 : SORT_KEY_T p3 = key[ h1 ];
592 3381801 : # if SORT_QUICK_ORDER_STYLE==0
593 : /* Generates better code for integral types (branchless via stack) */
594 3381801 : SORT_KEY_T _k[2]; ulong _c;
595 16909005 : # define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
596 : # else
597 : /* Generates much better code for floating point types (branchless
598 : via min / max floating point instructions) */
599 : SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
600 : # define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
601 : # endif
602 :
603 3381801 : ORDER(p0,p2); ORDER(p1,p3);
604 3381801 : ORDER(p0,p1); ORDER(p2,p3);
605 3381801 : ORDER(p1,p2);
606 3381801 : # undef ORDER
607 3381801 : } while(0);
608 :
609 3770133 : retry: /* Three way partitioning (silly language restriction) */;
610 3770133 : SORT_IDX_T i = l;
611 3770133 : SORT_IDX_T j = l;
612 3770133 : SORT_IDX_T k = h-(SORT_IDX_T)1;
613 537890262 : do { /* Note [j,k] is non-empty (look ma ... no branches!) */
614 :
615 : /* At this point, p1 <= p2 and:
616 : - keys [l,i) are before p1 (left partition)
617 : - keys [i,j) are in [p1,p2] (center partition)
618 : - keys [j,k] are not yet partitioned (region is non-empty)
619 : - keys (k,h) are after p2 (right partition)
620 : Decide where key j should be moved. */
621 :
622 537890262 : SORT_KEY_T kj = key[j];
623 :
624 537890262 : int to_left = (SORT_BEFORE( kj, p1 ));
625 537890262 : int to_right = (SORT_BEFORE( p2, kj ));
626 537890262 : SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );
627 :
628 : # if SORT_QUICK_SWAP_MINIMIZE
629 : if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; } /* ~2/3 prob */
630 : # else
631 : /**/ key[j] = key[m]; key[m] = kj;
632 537890262 : # endif
633 :
634 537890262 : i += (SORT_IDX_T) to_left;
635 537890262 : j += (SORT_IDX_T)(to_right^1); /* to_left_or_to_center <=> !to_right */
636 537890262 : k -= (SORT_IDX_T) to_right;
637 537890262 : } while( j<=k );
638 :
639 : /* Schedule recursion */
640 :
641 3770133 : if( FD_UNLIKELY( (j-i)==n ) ) {
642 :
643 : /* All the keys ended up in the center partition. If p1<p2, we
644 : had an unlucky choice of pivots such that p1==min(key[i]) and
645 : p2==max(key[i]) for i in [l,h). Since we picked the keys
646 : deterministically above, we would have the same bad luck the
647 : next time we processed this partition. But, because there are
648 : at least two distinct elements in the partition, there is a
649 : choice of pivots that will make a non-trivial partition, so we
650 : need to redo this partition with different pivots.
651 :
652 :             Randomly picking a new pivot pair in [l,h) has a probability
653 :             between ~50% and ~100% of yielding a non-trivial partition.
654 :             The worst case is half the keys in the partition equal to p1
655 :             and the other half equal to p2; in the best case, exactly one
656 :             key is equal to p1 / p2 and all other keys are in (p1,p2] / [p1,p2).
657 :
658 : The optimal handling of the worst case though is just to set
659 : p1=p2 (p2=p1). This will pull the keys equal to p1 (p2) into
660 : the left (right) partition and the remaining keys into the
661 : center partition; the right (left) partition will be empty.
662 : Thus, this is guaranteed to yield a non-trivial partition of
663 :             the center partition but may not yield as load balanced a set of
664 :             partitions as a random choice for the next iteration.  We go with the
665 : deterministic option for simplicity below. */
666 :
667 1550499 : if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }
668 :
669 : /* p1==p2 here ... all keys in this partition are the same
670 : We don't need to recurse further in this partition. */
671 :
672 2219634 : } else {
673 :
674 : /* Between [1,n-1] keys ended up in the center partition such that
675 : at least 1 key ended up in another partition. We push all
676 : three partitions onto the partition stack for recursion. The
677 : order of the pushes is important to bound the maximum stack
678 : usage. Below, we push the center partition followed by the
679 : larger of the left and right partitions then followed by the
680 : smaller of the left and right partitions (such that we process
681 : the smallest next iteration). For this, the same O(log_2 cnt)
682 : max recursion depth applies as quicksort even this does a three
683 :          max recursion depth applies as for quicksort even though this does
684 :          a three way partitioning.  That is, let fl/fc/fr be the fraction of
685 : is in [0,1), fc is in (0,1) and fr is in [0,1) and fl+fc+fr=1.
686 : The smaller of the left or right partition is what process next
687 :          The smaller of the left or right partition is what we process next
688 :          and, as the smaller of the left or right, its fraction of the
689 :          keys is at most 0.5(1-fc)<=0.5.  We do push up to two
690 :          partitions onto the stack (four SORT_IDX_T) each level of
691 :          recursion though, instead of one like normal quick sort. */
692 2219634 : int left_larger = (i-l) > (h-j); /* True if the left partition should go on the stack */
693 2219634 : SORT_IDX_T l_larger = SORT_IDX_IF( left_larger, l, j );
694 2219634 : SORT_IDX_T h_larger = SORT_IDX_IF( left_larger, i, h );
695 2219634 : SORT_IDX_T l_smaller = SORT_IDX_IF( left_larger, j, l );
696 2219634 : SORT_IDX_T h_smaller = SORT_IDX_IF( left_larger, h, i );
697 :
698 : /* Immediately process empty partitions */
699 2219634 : ulong push_smaller = (ulong)(l_smaller<h_smaller); FD_COMPILER_FORGET( push_smaller );
700 :
701 : /* Immediately process partitions where all keys are the same */
702 2219634 : ulong push_center = (ulong)(SORT_BEFORE( p1, p2 )); FD_COMPILER_FORGET( push_center );
703 :
704 : /* Guaranteed non-empty (larger of left or right have at least 1 key) */
705 2219634 : ulong push_larger = 1UL;
706 :
707 2219634 : stack[ stack_cnt ] = l_larger; stack_cnt += push_larger;
708 2219634 : stack[ stack_cnt ] = h_larger; stack_cnt += push_larger;
709 2219634 : stack[ stack_cnt ] = i; stack_cnt += push_center;
710 2219634 : stack[ stack_cnt ] = j; stack_cnt += push_center;
711 2219634 : stack[ stack_cnt ] = l_smaller; stack_cnt += push_smaller;
712 2219634 : stack[ stack_cnt ] = h_smaller; stack_cnt += push_smaller;
713 2219634 : }
714 3770133 : }
715 : /* never get here */
716 4699236 : }
717 :
718 : /* This works identically to sort_private_quick.  Only differences
719 : relevant to selection are commented (in short, we only recurse on the
720 : partition that could contain rank). See above for comments on the
721 : algo. */
722 :
723 : SORT_FN_ATTR SORT_KEY_T *
724 : SORT_(private_select)( SORT_KEY_T * key,
725 : SORT_IDX_T cnt,
726 231484 : SORT_IDX_T rnk ) {
727 231484 : SORT_IDX_T l = (SORT_IDX_T)0;
728 231484 : SORT_IDX_T h = cnt;
729 748693 : for(;;) {
730 :
731 : /* The partition [l,h) contains rnk. If this partition is small
732 : enough, sort it via insertion sort. FIXME: probably could
733 : truncate the insertion sort early for further optimization (e.g.
734 : if rnk==l / rnk==h-1, we only need to find the min/max for O(n)
735 : operation). */
736 :
737 748693 : SORT_IDX_T n = h-l;
738 748693 : if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) { SORT_(insert)( key+l, n ); return key; }
739 :
740 517233 : SORT_KEY_T p1;
741 517233 : SORT_KEY_T p2;
742 517233 : do {
743 517233 : SORT_IDX_T n3 = n / (SORT_IDX_T)3;
744 517233 : SORT_IDX_T h1 = h - (SORT_IDX_T)1;
745 517233 : SORT_KEY_T p0 = key[ l ];
746 517233 : /**/ p1 = key[ l + n3 ];
747 517233 : /**/ p2 = key[ h1 - n3 ];
748 517233 : SORT_KEY_T p3 = key[ h1 ];
749 517233 : # if SORT_QUICK_ORDER_STYLE==0
750 517233 : SORT_KEY_T _k[2]; ulong _c;
751 2586165 : # define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
752 : # else
753 : SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
754 : # define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
755 : # endif
756 :
757 517233 : ORDER(p0,p2); ORDER(p1,p3);
758 517233 : ORDER(p0,p1); ORDER(p2,p3);
759 517233 : ORDER(p1,p2);
760 517233 : # undef ORDER
761 517233 : } while(0);
762 :
763 517233 : retry: /* Silly language restriction */;
764 517233 : SORT_IDX_T i = l;
765 517233 : SORT_IDX_T j = l;
766 517233 : SORT_IDX_T k = h-(SORT_IDX_T)1;
767 57770702 : do {
768 :
769 57770702 : SORT_KEY_T kj = key[j];
770 :
771 57770702 : int to_left = (SORT_BEFORE( kj, p1 ));
772 57770702 : int to_right = (SORT_BEFORE( p2, kj ));
773 57770702 : SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );
774 :
775 : # if SORT_QUICK_SWAP_MINIMIZE
776 : if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; }
777 : # else
778 : /**/ key[j] = key[m]; key[m] = kj;
779 57770702 : # endif
780 :
781 57770702 : i += (SORT_IDX_T) to_left;
782 57770702 : j += (SORT_IDX_T)(to_right^1);
783 57770702 : k -= (SORT_IDX_T) to_right;
784 57770702 : } while( j<=k );
785 :
786 517233 : if( FD_UNLIKELY( (j-i)==n ) ) {
787 :
788 24 : if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }
789 :
790 : /* p1==p2 here ... all keys in this partition are the same. We
791 :        don't need to recurse further in this partition as any key in it
792 :        would do here. */
793 :
794 24 : return key;
795 24 : }
796 :
797 : /* At this point:
798 : - [l,i) is the left partition,
799 : - [i,j) is the center partition
800 : - [j,h) is the right partition
801 : Between [1,n-1] keys ended up in the center partition such that
802 : at least 1 key ended up in another partition. Recurse on the
803 : partition that contains rnk. As this is typically used in median
804 : finding, we assume that the center partition is the most likely
805 : in the below selection (this can also be done branchlessly). */
806 :
807 517209 : SORT_IDX_T l_next = i; SORT_IDX_T h_next = j;
808 517209 : if( FD_UNLIKELY( rnk< i ) ) l_next = l, h_next = i;
809 517209 : if( FD_UNLIKELY( j <=rnk ) ) l_next = j, h_next = h;
810 517209 : l = l_next; h = h_next;
811 517209 : }
812 : /* never get here */
813 231484 : }
814 :
815 : #if SORT_PARALLEL
816 :
817 : /* This works identically to sort_private_quick.  Only differences
818 : relevant to parallelization are commented. See above for comments on
819 : the algo. */
820 :
821 : SORT_FN_ATTR void
822 : SORT_(private_quick_node)( void * _tpool,
823 : ulong t0, ulong t1,
824 : void * _args,
825 : void * _reduce, ulong _stride,
826 : ulong _l0, ulong _l1,
827 : ulong _m0, ulong _m1,
828 0 : ulong _n0, ulong _n1 ) {
829 0 : (void)_stride; (void)_l0; (void)_l1; (void)_m0; (void)_m1; (void)_n0; (void)_n1;
830 :
831 0 : fd_tpool_t * tpool = (fd_tpool_t *) _tpool;
832 0 : SORT_KEY_T * key = (SORT_KEY_T *) _args;
833 0 : SORT_IDX_T cnt = (SORT_IDX_T)(ulong)_reduce;
834 :
835 0 : SORT_IDX_T stack[ 4UL*8UL*sizeof(SORT_IDX_T) ];
836 0 : ulong stack_cnt = 0UL;
837 :
838 0 : ulong wait_stack[ 82 ]; /* See note below for sizing considerations */
839 0 : ulong wait_stack_cnt = 0UL;
840 :
841 0 : stack[ stack_cnt++ ] = (SORT_IDX_T)0;
842 0 : stack[ stack_cnt++ ] = cnt;
843 :
844 0 : while( stack_cnt ) {
845 :
846 0 : SORT_IDX_T h = stack[ --stack_cnt ];
847 0 : SORT_IDX_T l = stack[ --stack_cnt ];
848 :
849 0 : SORT_IDX_T n = h-l;
850 0 : if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) {
851 0 : SORT_(insert)( key+l, n );
852 0 : continue;
853 0 : }
854 :
855 0 : SORT_KEY_T p1;
856 0 : SORT_KEY_T p2;
857 0 : do {
858 0 : SORT_IDX_T n3 = n / (SORT_IDX_T)3;
859 0 : SORT_IDX_T h1 = h - (SORT_IDX_T)1;
860 0 : SORT_KEY_T p0 = key[ l ];
861 0 : /**/ p1 = key[ l + n3 ];
862 0 : /**/ p2 = key[ h1 - n3 ];
863 0 : SORT_KEY_T p3 = key[ h1 ];
864 0 : # if SORT_QUICK_ORDER_STYLE==0
865 0 : SORT_KEY_T _k[2]; ulong _c;
866 0 : # define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
867 : # else
868 : SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
869 : # define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
870 : # endif
871 :
872 0 : ORDER(p0,p2); ORDER(p1,p3);
873 0 : ORDER(p0,p1); ORDER(p2,p3);
874 0 : ORDER(p1,p2);
875 0 : # undef ORDER
876 0 : } while(0);
877 :
878 0 : retry: /* Silly language restriction */;
879 0 : SORT_IDX_T i = l;
880 0 : SORT_IDX_T j = l;
881 0 : SORT_IDX_T k = h-(SORT_IDX_T)1;
882 0 : do {
883 0 : SORT_KEY_T kj = key[j];
884 :
885 0 : int to_left = (SORT_BEFORE( kj, p1 ));
886 0 : int to_right = (SORT_BEFORE( p2, kj ));
887 0 : SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );
888 :
889 : # if SORT_QUICK_SWAP_MINIMIZE
890 : if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; }
891 : # else
892 : /**/ key[j] = key[m]; key[m] = kj;
893 0 : # endif
894 :
895 0 : i += (SORT_IDX_T) to_left;
896 0 : j += (SORT_IDX_T)(to_right^1);
897 0 : k -= (SORT_IDX_T) to_right;
898 0 : } while( j<=k );
899 :
900 0 : if( FD_UNLIKELY( (j-i)==n ) ) {
901 :
902 0 : if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }
903 :
904 0 : } else {
905 :
906 : /* At this point, we have at most 3 partitions to sort and can use
907 : the caller and tpool threads (t0,t1) to sort them. To load
908 : balance this, we sort the partitions by an estimate of how much
909 : work each requires. We estimate this as proportional to:
910 :
911 : work = cnt ceil log_3 cnt if cnt>0 and 0 otherwise
912 :
913 : where cnt is the number of keys in the partition. Since this
914 : is a rough estimate used for load balancing, we don't need a
915 : precise log_3 calculation. With log_3 x ~ 0.633 lg x, we
916 : approximate:
917 :
918 : ceil log_3 x ~ ceil( 5 (lg x) / 8 )
919 : ~ ceil( 5 (floor lg x) / 8)
920 : = floor( (5 (floor lg x) + 7)/8 )
921 : = (5 (msb x) + 7) >> 3
922 :
923 : This approximation is trivial to compute, is monotonic and is
924 : either exact or 1 less for x in (0,2^64). The approximation is
925 : such that cnt==1 / cnt>1 yields zero / positive. As such,
926 : work==0 strictly indicates no additional work is necessary. */
927 :
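      /* Illustrative spot checks of the approximation: x=2 -> (5*1+7)>>3 = 1
         = ceil log_3 2, x=27 -> (5*4+7)>>3 = 3 = ceil log_3 27, x=100 ->
         (5*6+7)>>3 = 4 vs ceil log_3 100 = 5 (1 less). */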
928 0 : # define WORK_APPROX(cnt) (((ulong)(cnt))*(ulong)((5*fd_ulong_find_msb_w_default( (ulong)(cnt), 0 )+7) >> 3))
929 :
930 0 : struct { SORT_IDX_T l; SORT_IDX_T h; ulong work; } part[3];
931 :
932 0 : part[0].l = l; part[0].h = i; part[0].work = WORK_APPROX( i-l );
933 0 : part[1].l = i; part[1].h = j; part[1].work = fd_ulong_if( SORT_BEFORE( p1, p2 ), WORK_APPROX( j-i ), 0UL );
934 0 : part[2].l = j; part[2].h = h; part[2].work = WORK_APPROX( h-j );
935 :
936 0 : # undef WORK_APPROX
937 :
938 0 : ulong work_remain = part[0].work + part[1].work + part[2].work;
939 :
940 0 : # define ORDER(i,j) fd_swap_if( part[j].work > part[i].work, part[i], part[j] )
941 :
942 0 : ORDER(0,2); ORDER(0,1); ORDER(1,2);
943 :
944 0 : # undef ORDER
945 :
946 0 : for( ulong idx=0UL; idx<3UL; idx++ ) {
947 :
948 : /* At this point, we need to schedule partitions [idx,2] requiring
949 : work_remain total estimated work and we have the caller and
950 : tpool threads (t0,t1) to do this work. Compute which threads
951 : should execute partition idx. */
952 :
953 0 : ulong p_work = part[idx].work;
954 0 :       if( FD_UNLIKELY( !p_work ) ) break; /* no work needed for this partition and remaining partitions */
955 :
956 0 : SORT_IDX_T p_l = part[idx].l;
957 0 : SORT_IDX_T p_h = part[idx].h;
958 :
959 0 : ulong t_cnt = t1 - t0;
960 0 : if( FD_UNLIKELY( (t_cnt>1UL) & (p_work<work_remain) ) ) {
961 :
962 : /* At this point, we have at least two partitions that need
963 : work to sort and at least two threads remaining for
964 : sorting. Schedule this partition for execution on between
965 : 1 and t_cnt-1 of the remaining threads roughly proportional
966 : to the estimated work to sort this partition versus the
967 : total estimated work remaining.
968 :
969 : For the wait stack, in the worst case, we schedule two
970 : partitions for remote execution covering two thirds of the
971 : keys each round. This implies the wait stack depth is
972 : bounded by:
973 :
974 : 2 ceil log_3 thread_cnt
975 :
976 : Practically, thread_cnt<=FD_TILE_MAX. For
977 : FD_TILE_MAX==1024, 14 is sufficient and 82 is sufficient
978 : for any thread_cnt representable by a ulong. */
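        /* (Spot check: 2 ceil log_3 1024 = 2*7 = 14 and
           2 ceil log_3 2^64 = 2*41 = 82, matching the sizes above.) */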
979 :
980 0 : ulong ts = t1 - fd_ulong_min( fd_ulong_max( (t_cnt*p_work + (work_remain>>1)) / work_remain, 1UL ), t_cnt-1UL );
981 :
982 0 : fd_tpool_exec( tpool, ts, SORT_(private_quick_node), tpool, ts, t1, key + p_l, (void *)(ulong)(p_h - p_l),
983 0 : 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL );
984 :
985 0 : wait_stack[ wait_stack_cnt++ ] = ts;
986 :
987 : /* Advance t1 and work_remain for the next iteration */
988 :
989 0 : t1 = ts;
990 0 : work_remain -= p_work;
991 :
992 0 : } else {
993 :
994 : /* At this point, we have only one thread available and/or
995 : this is the only partition remaining that needs sorting.
996 : Schedule this partition for local execution. */
997 :
998 0 : stack[ stack_cnt++ ] = p_l;
999 0 : stack[ stack_cnt++ ] = p_h;
1000 :
1001 0 : }
1002 0 : }
1003 0 : }
1004 0 : }
1005 :
1006 : /* Wait for any children to finish */
1007 :
1008 0 : while( wait_stack_cnt ) fd_tpool_wait( tpool, wait_stack[ --wait_stack_cnt ] );
1009 0 : }
1010 :
1011 0 : FD_MAP_REDUCE_BEGIN( SORT_(private_merge_para), 1L, 0UL, sizeof(SORT_KEY_T *), 1L ) {
1012 :
1013 0 : SORT_KEY_T ** out = (SORT_KEY_T **)arg[0];
1014 0 : SORT_KEY_T * key = (SORT_KEY_T * )arg[1];
1015 0 : SORT_KEY_T * tmp = (SORT_KEY_T * )arg[2];
1016 :
1017 0 : *out = SORT_(stable_fast)( key + block_i0, (SORT_IDX_T)block_cnt, tmp + block_i0 );
1018 :
1019 0 : } FD_MAP_END {
1020 :
1021 0 : SORT_KEY_T * key = (SORT_KEY_T * )arg[1];
1022 0 : SORT_KEY_T * tmp = (SORT_KEY_T * )arg[2];
1023 0 : SORT_KEY_T ** _in_l = (SORT_KEY_T **)arg[0]; SORT_KEY_T * in_l = *_in_l; long cnt_l = block_is - block_i0;
1024 0 : SORT_KEY_T ** _in_r = (SORT_KEY_T **)_r1; SORT_KEY_T * in_r = *_in_r; long cnt_r = block_i1 - block_is;
1025 :
1026 : /* Merge in_l / in_r (see private_merge above for details) */
1027 :
1028 0 : SORT_KEY_T * out = ((in_l==(key+block_i0)) ? tmp : key) + block_i0; /* cmov */
1029 :
1030 0 : SORT_(private_merge_pass)( in_l, cnt_l, in_r, cnt_r, out );
1031 :
1032 0 : *_in_l = out;
1033 :
1034 0 : } FD_REDUCE_END
1035 :
1036 0 : FD_FOR_ALL_BEGIN( SORT_(private_memcpy_para), 1L ) {
1037 :
1038 0 : SORT_KEY_T * dst = (SORT_KEY_T *)arg[0];
1039 0 : SORT_KEY_T const * src = (SORT_KEY_T const *)arg[1];
1040 :
1041 0 : if( FD_LIKELY( block_cnt ) ) memcpy( dst + block_i0, src + block_i0, sizeof(SORT_KEY_T)*(ulong)block_cnt );
1042 :
1043 0 : } FD_FOR_ALL_END
1044 :
1045 : #if FD_HAS_ALLOCA
1046 :
1047 0 : static FD_FOR_ALL_BEGIN( SORT_(private_cntcpy_para), 1L ) {
1048 0 : ulong tpool_base = (ulong )arg[0];
1049 0 : ulong t_cnt = (ulong )arg[1];
1050 0 : ulong * _key_cnt = (ulong *)arg[2];
1051 0 : SORT_KEY_T const * key = (SORT_KEY_T const *)arg[3];
1052 0 : SORT_KEY_T * tmp = (SORT_KEY_T *)arg[4];
1053 0 : SORT_KEY_T const * pivot = (SORT_KEY_T const *)arg[5];
1054 :
1055 : /* Note keys in [ pivot[t-1], pivot[t] ) for t in [0,t_cnt) are keys
1056 : assigned to thread t for sorting. pivot[-1] / pivot[t_cnt-1] are
1057 : an implied -infinity / +infinity and never explicitly accessed. As
1058 : such, pivot is indexed [0,t_cnt-1). */
1059 :
1060 : /* Allocate and clear a local scratch for counting. We don't count
1061 : directly into key_cnt to avoid false sharing while counting. */
1062 :
1063 0 : ulong * key_cnt = fd_alloca( alignof(ulong), sizeof(ulong)*t_cnt );
1064 :
1065 0 : memset( key_cnt, 0, sizeof(ulong)*t_cnt );
1066 :
1067 0 : for( long i=block_i0; i<block_i1; i++ ) {
1068 0 : SORT_KEY_T ki = key[i];
1069 :
1070 : /* Determine which thread is responsible for subsorting this key */
1071 :     /* FIXME: ideally, use an unrolled fixed count iteration here */
1072 :
1073 0 : ulong l = 0UL;
1074 0 : ulong h = t_cnt;
1075 0 : for(;;) {
1076 0 : ulong n = h - l;
1077 0 : if( n<2UL ) break;
1078 :
1079 : /* At this point, the thread for key i is in [l,h) and this range
1080 : contains at least two threads. Split this range in half. */
1081 :
1082 0 : ulong m = l + (n>>1); /* In [1,t_cnt) */
1083 0 : int c = SORT_BEFORE( ki, pivot[m-1UL] );
1084 :
1085 : /* If ki is before pivot[m], the target thread is in [l,m).
1086 : Otherwise, the target thread is in [m,h). */
1087 :
1088 0 : l = fd_ulong_if( c, l, m );
1089 0 : h = fd_ulong_if( c, m, h );
1090 0 : }
1091 :
1092 : /* At this point, key[i] should be assigned to thread l. Count it
1093 : and copy it into tmp to prepare to scatter. */
1094 :
1095 0 : key_cnt[l]++;
1096 0 : tmp[i] = ki;
1097 0 : }
1098 :
1099 : /* Send the counts back to main thread for partitioning */
1100 :
1101 0 : memcpy( _key_cnt + (tpool_t0-tpool_base)*t_cnt, key_cnt, sizeof(ulong)*t_cnt );
1102 :
1103 0 : } FD_FOR_ALL_END
1104 :
1105 0 : static FD_FOR_ALL_BEGIN( SORT_(private_scatter_para), 1L ) {
1106 0 : ulong tpool_base = (ulong )arg[0];
1107 0 : ulong t_cnt = (ulong )arg[1];
1108 0 : ulong const * _part = (ulong const *)arg[2];
1109 0 : SORT_KEY_T * key = (SORT_KEY_T *)arg[3];
1110 0 : SORT_KEY_T const * tmp = (SORT_KEY_T const *)arg[4];
1111 0 : SORT_KEY_T const * pivot = (SORT_KEY_T const *)arg[5];
1112 :
1113 : /* Receive the key array partitioning from the main thread. Like the
1114 : above, we don't operate on the part array directly to avoid false
1115 : sharing while scattering. */
1116 :
1117 0 : ulong * part = fd_alloca( alignof(ulong), sizeof(ulong)*t_cnt );
1118 :
1119 0 : memcpy( part, _part + (tpool_t0-tpool_base)*t_cnt, sizeof(ulong)*t_cnt );
1120 :
1121 0 : for( long i=block_i0; i<block_i1; i++ ) {
1122 0 : SORT_KEY_T ki = tmp[i];
1123 :
1124 :     /* Determine which thread is responsible for subsorting this key
1125 : (again). Identical considerations to the above. Note: we can
1126 : eliminate this computation if willing to burn O(N) additional
1127 : scratch memory by saving results computed above for use here. */
1128 :
1129 0 : ulong l = 0UL;
1130 0 : ulong h = t_cnt;
1131 0 : for(;;) {
1132 0 : ulong n = h - l;
1133 0 : if( n<2UL ) break;
1134 0 : ulong m = l + (n>>1);
1135 0 : int c = SORT_BEFORE( ki, pivot[m-1UL] );
1136 0 : l = fd_ulong_if( c, l, m );
1137 0 : h = fd_ulong_if( c, m, h );
1138 0 : }
1139 :
1140 : /* Send this key to target thread */
1141 :
1142 0 : key[ part[l]++ ] = ki;
1143 0 : }
1144 :
1145 0 : } FD_FOR_ALL_END
1146 :
1147 0 : static FD_FOR_ALL_BEGIN( SORT_(private_subsort_para), 1L ) {
1148 0 : ulong const * part = (ulong const *)arg[0];
1149 0 : SORT_KEY_T * key = (SORT_KEY_T *)arg[1];
1150 0 : SORT_KEY_T * tmp = (SORT_KEY_T *)arg[2];
1151 0 : int stable = (int )arg[3];
1152 :
1153 0 : ulong j0 = part[ block_i0 ];
1154 0 : ulong j1 = part[ block_i1 ];
1155 :
1156 0 : if( stable ) SORT_(stable) ( key + j0, j1 - j0, tmp + j0 );
1157 0 : else SORT_(inplace)( key + j0, j1 - j0 );
1158 :
1159 0 : } FD_FOR_ALL_END
1160 :
1161 : SORT_FN_ATTR SORT_KEY_T *
1162 : SORT_(fast_para)( fd_tpool_t * tpool, ulong t0, ulong t1,
1163 : SORT_KEY_T * key,
1164 : SORT_IDX_T cnt,
1165 : void * scratch,
1166 : ulong seed,
1167 0 : int stable ) {
1168 :
1169 0 : if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key; /* nothing to do */
1170 :
1171 : /* For the below (if sampling ops are fully threaded), sampling costs
1172 : O(S), the sample sort costs O(S lg(T S)), the downsampling costs
1173 : O(1), the partitioning costs O(T), the counting and scattering cost
1174 : O(N (lg T) / T), the subsorts cost (N/T) lg(N/T) and thread
1175 : synchronization overhead is O(lg T).
1176 :
1177 : Combining all these, assuming the sampling ratio S is a fixed O(1)
1178 :      quantity and assuming the N (lg T) / T term in the counting/scattering
1179 :      approximately cancels the -N (lg T) / T term in the subsorts yields a
1180 : wallclock that scales as:
1181 :
1182 : alpha N (ln N) / T + beta ln T + gamma T
1183 :
1184 : The first term represents the parallelized sorting cost, the second
1185 : term represents the thread dispatch / sync cost, and the last term
1186 : represents the partitioning costs. Minimizing cost with respect
1187 : to T yields:
1188 :
1189 : -alpha (N ln N) / T_opt^2 + beta / T_opt + gamma = 0
1190 :
1191 : whose (positive) solution is:
1192 :
1193 :      T_opt ~ (0.5 beta / gamma) [ sqrt( 1 + (4 alpha gamma N ln N)/beta^2 ) - 1 ]
1194 :
1195 :      In the limit 4 alpha gamma N ln N / beta^2 << 1 (that is,
1196 : partitioning costs are much lower than thread start/stop costs), we
1197 : have:
1198 :
1199 : T_opt -> (alpha N ln N) / beta
1200 :
1201 : In the other limit, we have:
1202 :
1203 : T_opt -> sqrt( alpha N ln N / gamma )
1204 :
1205 : The first limit tends to apply in practice. Thus we use:
1206 :
1207 : T_opt ~ (N lg N) / thresh
1208 : ~ floor( (N msb N + N/2 + thresh/2) / thresh)
1209 :
1210 : where thresh is an empirical minimum amount of sorting work to
1211 : justify starting / stopping a thread. (As written below, the gamma
1212 : term is more like gamma T^2, which makes the other limit more like
1213 : the cube root of N ln N but doesn't change the overall
1214 : conclusion.) */
1215 :
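  /* Illustrative arithmetic for the t_cnt formula below (assuming an 8 byte
     key, so thresh = 512): cnt = 2^16 gives
     (65536*16 + (65536+512)/2)/512 = 2112, so in practice the thread count
     is typically limited by t1-t0. */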
1216 0 : ulong thresh = (4096UL + sizeof(SORT_KEY_T)-1UL) / sizeof(SORT_KEY_T);
1217 0 : ulong t_cnt = fd_ulong_min( t1 - t0, (cnt*(ulong)fd_ulong_find_msb( cnt ) + ((cnt+thresh)>>1)) / thresh );
1218 0 : if( FD_UNLIKELY( t_cnt<2UL ) ) return stable ? SORT_(stable)( key, cnt, scratch ) : SORT_(inplace)( key, cnt );
1219 0 : t1 = t0 + t_cnt;
1220 :
1221 : /* At this point, we have at least 2 threads available and at least 2
1222 : items to sort. Sample the keys to get some idea of their
1223 : distribution, sort the samples and downsample the sorted samples
1224 : into pivots that approximately uniformly partition the samples into
1225 : t_cnt groups (and thus approximately partition the keys uniformly
1226 : too). Notes:
1227 :
1228 : - The sampling below is practically uniform IID but it could be
1229 : made more robust with a rejection method (ala fd_rng_ulong_roll).
1230 :
1231 : - Increasing SORT_OVERSAMPLE_RATIO improves the uniformity of the
1232 : partitioning of key space over the inputs but requires more stack
1233 : scratch memory and more overhead.
1234 :
1235 : - All three steps here could be parallelized but it is probably not
1236 : worth it given the overhead for threaded versus the number of
1237 :        worth it given the overhead of threading versus the number of
1238 :
1239 : - For a stable sort, the result is completely deterministic even
1240 : if the seed provided is non-deterministic. */
1241 :
1242 0 : ulong sample_cnt = t_cnt*(ulong)SORT_OVERSAMPLE_RATIO;
1243 0 :   SORT_KEY_T * pivot = fd_alloca( alignof(SORT_KEY_T), sizeof(SORT_KEY_T)*sample_cnt );
1244 :
1245 0 : for( ulong i=0UL; i<sample_cnt; i++ ) pivot[i] = key[ fd_ulong_hash( seed ^ i ) % cnt ];
1246 :
1247 0 : SORT_(inplace)( pivot, sample_cnt );
1248 :
1249 0 : for( ulong i=1UL; i<t_cnt; i++ ) pivot[i-1UL] = pivot[ i*(ulong)SORT_OVERSAMPLE_RATIO ];
1250 :
1251 : /* At this point, keys in [ pivot[t-1], pivot[t] ) should be assigned
1252 : to thread t for thread t's subsort. pivot[-1] / pivot[t_cnt-1] are
1253 : an implicit -infinity / +infinity. Split the input array equally
1254 : over threads and have each thread copy their block of keys into tmp
1255 : and count the number of keys in its block that could be assigned to
1256 : each thread. */
1257 :
1258 0 : # define part(i,j) part[ (i) + t_cnt*(j) ]
1259 :
1260 0 : ulong * part = fd_alloca( alignof(ulong), sizeof(ulong)*t_cnt*t_cnt );
1261 :
1262 0 : SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
1263 :
1264 0 : FD_FOR_ALL( SORT_(private_cntcpy_para), tpool,t0,t1, 0L,(long)cnt, t0, t_cnt, part, key, tmp, pivot );
1265 :
1266 : /* At this point, part(i,j) is the count of the number of keys in thread
1267 : j's block that were assigned to thread i's subsort. Convert this
1268 :      into a partitioning.  In principle this can be parallelized but
1269 : this is rarely worth it practically. */
1270 :
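  /* Illustrative example (t_cnt = 2): if thread 0's block had 3 keys for
     subsort 0 and 1 key for subsort 1, and thread 1's block had 2 keys for
     subsort 0 and 4 keys for subsort 1, the counts

       part(0,0)=3  part(0,1)=2  part(1,0)=1  part(1,1)=4

     become the exclusive prefix sums (scatter offsets)

       part(0,0)=0  part(0,1)=3  part(1,0)=5  part(1,1)=6

     with k ending at the 10 key total. */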
1271 0 : ulong k = 0UL;
1272 0 : for( ulong i=0UL; i<t_cnt; i++ ) {
1273 0 : for( ulong j=0UL; j<t_cnt; j++ ) {
1274 0 : ulong c_ij = part(i,j);
1275 0 : part(i,j) = k;
1276 0 : k += c_ij;
1277 0 : }
1278 0 : }
1279 :
1280 :   /* At this point, the range [ part(i,j), part(i,j+1) ) (reading
1281 :      part(i,t_cnt) as part(i+1,0) and the end of the last range as an
1282 :      implied cnt) is where keys in thread j's block assigned to thread
1283 :      i's subsort should be scattered.  Scatter the keys from tmp back
1284 :      into key in subsorted order. */
1284 :
1285 0 : FD_FOR_ALL( SORT_(private_scatter_para), tpool,t0,t1, 0L,(long)cnt, t0, t_cnt, part, key, tmp, pivot );
1286 :
1287 : /* At this point, keys [ part(i,0), part(i+1,0) ) are the keys assigned
1288 : to thread i for subsorting. part(t_cnt,0) is an implied cnt.
1289 :      Since t_cnt>1, this location exists in part.  We make that explicit
1290 : such that part[i],part[i+1] give the sets of keys each thread
1291 : should sort. Do the thread parallel subsorts to get the keys into
1292 : final sorted order. */
1293 :
1294 0 : part[ t_cnt ] = cnt;
1295 :
1296 0 : FD_FOR_ALL( SORT_(private_subsort_para), tpool,t0,t1, 0L,(long)t_cnt, part, key, tmp, stable );
1297 :
1298 0 : # undef part
1299 :
1300 0 : return key;
1301 0 : }
1302 :
1303 : #endif /* FD_HAS_ALLOCA */
1304 : #endif /* SORT_PARALLEL */
1305 : #endif /* SORT_IMPL_STYLE!=1 */
1306 :
1307 : #undef SORT_
1308 : #undef SORT_STATIC
1309 :
1310 : #undef SORT_IMPL_STYLE
1311 : #undef SORT_FN_ATTR
1312 : #undef SORT_IDX_IF
1313 : #undef SORT_OVERSAMPLE_RATIO
1314 : #undef SORT_PARALLEL
1315 : #undef SORT_QUICK_SWAP_MINIMIZE
1316 : #undef SORT_QUICK_ORDER_STYLE
1317 : #undef SORT_QUICK_THRESH
1318 : #undef SORT_MERGE_THRESH
1319 : #undef SORT_BEFORE
1320 : #undef SORT_IDX_T
1321 : #undef SORT_KEY_T
1322 : #undef SORT_NAME
|