Line data Source code
1 : /* Declares a family of functions useful for single threaded sorting of
2 : POD types in high performance contexts. Example usage:
3 :
4 : #define SORT_NAME sort_double_descend
5 : #define SORT_KEY_T double
6 : #define SORT_BEFORE(a,b) (a)>(b)
7 : #include "util/tmpl/fd_sort.c"
8 :
9 : will create the following API for use in the local compile unit:
10 :
11 : // Sort key[i] for i in [0,cnt) stable in place in a best / average
12 : // worst case of O(N) / O(N^2) / O(N^2) operations.
13 :
14 : double *
15 : sort_double_descend_insert( double * key,
16 : ulong cnt );
17 :
18 : // Return 1 if cnt is a sensible value for the sorting APIs
19 : // (i.e. cnt*sizeof(double)+alignof(double)-1 will not overflow,
20 : // 3*cnt*ceil(log_3(cnt)) will not overflow, etc).
21 :
22 : int sort_double_descend_cnt_valid( ulong cnt );
23 :
24 : // Return the alignment and footprint required for a scratch region
25 : // adequate for sorting up to cnt elements. The results will be
26 : // standard allocator and standard declaration friendly. (E.g. a
27 : // declaration "double scratch[ cnt ];" will be fine as a scratch
28 : // region.)
29 :
30 : ulong sort_double_descend_stable_scratch_align ( void );
31 : ulong sort_double_descend_stable_scratch_footprint( ulong cnt );
32 :
33 : // Sort key[i] for i in [0,cnt) stable in best / average / worst
34 : // case of O(N lg N) / O(N lg N) / O(N lg N) operations. Scratch
35 : // is a scratch workspace of suitable alignment and footprint.
36 : // Returns where the sorted values ended up. Will be at either key
37 : // or (double *)scratch.
38 :
39 : double *
40 : sort_double_descend_stable_fast( double * key,
41 : ulong cnt,
42 : void * scratch );
43 :
44 : // Same as above but does additional copying if necessary such that
45 : // the final result ends up in key. Returns key.
46 :
47 : double *
48 : sort_double_descend_stable( double * key,
49 : ulong cnt,
50 : void * scratch );
51 :
52 : // Sort key[i] for i in [0,cnt) inplace in best / average / worst
53 : // case of O(N lg N) / O(N lg N) / O(N^2) operations. Returns key.
54 :
55 : double *
56 : sort_double_descend_inplace( double * key,
57 : ulong cnt );
58 :
59 : // Partially sort key[i] such that key[rnk] holds rnk and return
60 : // key[rnk] in best / average / worst of O(N) / O(N) / O(N^2)
61 : // operations. Returns key. Assumes key is non-NULL, cnt is valid
62 : // and positive and rnk is in [0,cnt)
63 :
64 : double *
65 : sort_double_descend_select( double * key,
66 : ulong cnt,
67 : ulong rnk );
68 :
69 : // Binary search for element greater or equal to key in a sorted
70 : // list. Return value is an index into sorted in [0,cnt]. If
71 : // return value > 0 then it is the largest index where
72 : // SORT_BEFORE(sorted[index-1],query)==1. Returns 0 if
73 : // SORT_BEFORE(query,elem)==1 for all elements in sorted. Assumes
74 : // SORT_BEFORE defines a total order over sorted, and each element
75 : // in sorted is less-or-equal than its successor. (Returns
76 : // incorrect result otherwise!) Robust if cnt==0 (in that case,
77 : // returns 0 and ignores pointer to sorted).
78 :
79 : ulong
80 : sort_double_descend_search_geq( double const * sorted,
81 : ulong cnt,
82 : double query );
83 :
84 : // sort_double_descend_{stable_fast,stable,inplace}_para is the
85 : // same as // above is adaptively parallelized over the caller
86 : // (typically tpool thread t0) and tpool threads (t0,t1). Assumes
87 : // tpool is valid, tpool threads (t0,t1) are idle (or soon to be
88 : // idle). These are only available if SORT_PARALLEL was requested.
89 :
90 : double *
91 : sort_double_descend_stable_fast_para( fd_tpool_t * tpool, ulong t0, ulong t1,
92 : double * key,
93 : ulong cnt,
94 : void * scratch );
95 :
96 : double *
97 : sort_double_descend_stable_para( fd_tpool_t * tpool, ulong t0, ulong t1,
98 : double * key,
99 : ulong cnt,
100 : void * scratch );
101 :
102 : double *
103 : sort_double_descend_inplace_para( fd_tpool_t * tpool, ulong t0, ulong t1,
104 : double * key,
105 : ulong cnt );
106 :
107 : // sort_double_descend_fast_para has better asymptotically
108 : // parallelization on average (~(N lg N)/T + lg T) but requires
109 : // more stack space on the caller (T^2). seed gives a random seed
110 : // to use (e.g. fd_tickcount()). If stable is non-zero, a stable
111 : // method will be used (and the resulting order will always be
112 : // deterministic). Otherwise a faster unstable method will be used
113 : // (the order in which equal keys end up in the array will depend
114 : // on seed). Assumes tpool is valid and tpool threads (t0,t1) are
115 : // idle (or soon to be idle). Only available if SORT_PARALLEL was
116 : // requested and the target has FD_HAS_ALLOCA.
117 :
118 : double *
119 : sort_double_descend_fast_para( fd_tpool_t * tpool, ulong t0, ulong t1,
120 : double * key,
121 : ulong cnt,
122 : void * scratch,
123 : ulong seed,
124 : int stable );
125 :
126 : It is fine to include this template multiple times in a compilation
127 : unit. Just provide the specification before each inclusion. Various
128 : additional options to tune the methods are described below. */
129 :
130 : /* SORT_NAME gives the name of the function to declare (and the base
131 : name of auxiliary and/or variant functions). */
132 :
133 : #ifndef SORT_NAME
134 : #error "SORT_NAME must be defined"
135 : #endif
136 :
137 : /* SORT_KEY_T gives the POD datatype to sort. */
138 :
139 : #ifndef SORT_KEY_T
140 : #error "SORT_KEY_T must be defined"
141 : #endif
142 :
143 : /* SORT_IDX_T gives the data type used to index the arrays */
144 :
145 : #ifndef SORT_IDX_T
146 1664241865 : #define SORT_IDX_T ulong
147 : #endif
148 :
149 : /* SORT_BEFORE(a,b) evaluates to 1 if a<b is strictly true. SAFETY TIP:
150 : This is not a 3-way comparison function! */
151 :
152 : #ifndef SORT_BEFORE
153 9137072 : #define SORT_BEFORE(a,b) (a)<(b)
154 : #endif
155 :
156 : /* SORT_MERGE_THRESH / SORT_QUICK_THRESH give largest merge/quick
157 : partition size where insertion sort will be used. Should be at least
158 : 2 (and probably larger than one might expect to get optimal
159 : performance). */
160 :
161 : #ifndef SORT_MERGE_THRESH
162 6340224 : #define SORT_MERGE_THRESH 32
163 : #endif
164 :
165 : #ifndef SORT_QUICK_THRESH
166 : #define SORT_QUICK_THRESH 32
167 : #endif
168 :
169 : /* SORT_QUICK_ORDER_STYLE selects the method used for ordering two keys.
170 : Roughly, say 1 for floating point keys (see quick method for
171 : details). */
172 :
173 : #ifndef SORT_QUICK_ORDER_STYLE
174 : #define SORT_QUICK_ORDER_STYLE 0
175 : #endif
176 :
177 : /* SORT_QUICK_SWAP_MINIMIZE indicates that quick sort should eat the
178 : non-deterministic branch cost to avoid no-op swaps. This is useful
179 : if the key_t has a large memory footprint such that the no-op is
180 : actually a larger cost than a branch mispredict. */
181 :
182 : #ifndef SORT_QUICK_SWAP_MINIMIZE
183 : #define SORT_QUICK_SWAP_MINIMIZE 0
184 : #endif
185 :
186 : /* SORT_PARALLEL will generate thread parallel versions of many sorts.
187 : Requires tpool. */
188 :
189 : #ifndef SORT_PARALLEL
190 : #define SORT_PARALLEL 0
191 : #endif
192 :
193 : /* SORT_OVERSAMPLE_RATIO indicates the amount of oversampling fast
194 : parallel sorts should use to fine pivots. Only relevant if
195 : SORT_PARALLEL is true and FD_HAS_ALLOCA. */
196 :
197 : #ifndef SORT_OVERSAMPLE_RATIO
198 0 : #define SORT_OVERSAMPLE_RATIO 5
199 : #endif
200 :
201 : /* SORT_IDX_IF(c,t,f) returns sort_idx t if c is non-zero and sort_idx f o.w. */
202 :
203 : #ifndef SORT_IDX_IF
204 575945518 : #define SORT_IDX_IF(c,t,f) ((SORT_IDX_T)fd_ulong_if( (c), (ulong)(t), (ulong)(f) ))
205 : #endif
206 :
207 : /* 0 - local use only
208 : 1 - library header declaration
209 : 2 - library implementation */
210 :
211 : #ifndef SORT_IMPL_STYLE
212 : #define SORT_IMPL_STYLE 0
213 : #endif
214 :
215 : /* Implementation *****************************************************/
216 :
217 : #if SORT_IMPL_STYLE==0 /* local use only */
218 : #define SORT_STATIC FD_FN_UNUSED static
219 : #else /* library header and/or implementation */
220 : #define SORT_STATIC
221 : #endif
222 :
223 16337550 : #define SORT_(x)FD_EXPAND_THEN_CONCAT3(SORT_NAME,_,x)
224 :
225 : #if SORT_IMPL_STYLE!=2 /* need header */
226 :
227 : #if SORT_PARALLEL
228 : #include "../tpool/fd_tpool.h"
229 : #else
230 : #include "../bits/fd_bits.h"
231 : #endif
232 :
233 : FD_PROTOTYPES_BEGIN
234 :
235 : FD_FN_CONST static inline int
236 0 : SORT_(cnt_valid)( SORT_IDX_T cnt ) {
237 0 : /* Written funny for supporting different signed idx types without
238 0 : generating compiler warnings. */
239 0 : SORT_IDX_T max = (SORT_IDX_T)fd_ulong_min( (-alignof(SORT_KEY_T))/sizeof(SORT_KEY_T), /* ==floor( (2^64-align-1)/sz ) */
240 0 : (ulong)(SORT_IDX_T)((1UL<<57)-1UL) ); /* ==SORT_IDX_MAX as ulong */
241 0 : return (!cnt) | ((((SORT_IDX_T)0)<cnt) & (cnt<max)) | (cnt==max);
242 0 : }
243 :
244 0 : FD_FN_CONST static inline ulong SORT_(stable_scratch_align) ( void ) { return alignof(SORT_KEY_T); }
245 0 : FD_FN_CONST static inline ulong SORT_(stable_scratch_footprint)( SORT_IDX_T cnt ) { return sizeof (SORT_KEY_T)*(ulong)cnt; }
246 :
247 : SORT_STATIC SORT_KEY_T *
248 : SORT_(private_merge)( SORT_KEY_T * key,
249 : long cnt,
250 : SORT_KEY_T * tmp );
251 :
252 : SORT_STATIC SORT_KEY_T *
253 : SORT_(private_quick)( SORT_KEY_T * key,
254 : SORT_IDX_T cnt );
255 :
256 : SORT_STATIC SORT_KEY_T *
257 : SORT_(private_select)( SORT_KEY_T * key,
258 : SORT_IDX_T cnt,
259 : SORT_IDX_T rnk );
260 :
261 : SORT_STATIC SORT_KEY_T *
262 : SORT_(insert)( SORT_KEY_T * key,
263 : SORT_IDX_T cnt );
264 :
265 : static inline SORT_KEY_T *
266 : SORT_(stable_fast)( SORT_KEY_T * key,
267 : SORT_IDX_T cnt,
268 510576 : void * scratch ) {
269 510576 : if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key;
270 510240 : SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
271 510240 : return SORT_(private_merge)( key, (long)cnt, tmp );
272 510576 : }
273 :
274 : static inline SORT_KEY_T *
275 : SORT_(stable)( SORT_KEY_T * key,
276 : SORT_IDX_T cnt,
277 510576 : void * scratch ) {
278 510576 : if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key;
279 510240 : SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
280 510240 : if( SORT_(private_merge)( key, (long)cnt, tmp )==tmp ) /* 50/50 branch prob */
281 105270 : memcpy( key, tmp, sizeof(SORT_KEY_T)*(ulong)cnt );
282 510240 : return key;
283 510576 : }
284 :
285 : static inline SORT_KEY_T *
286 : SORT_(inplace)( SORT_KEY_T * key,
287 1998366 : SORT_IDX_T cnt ) {
288 1998366 : return SORT_(private_quick)( key, cnt );
289 1998366 : }
290 :
291 : static inline SORT_KEY_T *
292 : SORT_(select)( SORT_KEY_T * key,
293 : SORT_IDX_T cnt,
294 231492 : SORT_IDX_T rnk ) {
295 231492 : return SORT_(private_select)( key, cnt, rnk );
296 231492 : }
297 :
298 : static inline SORT_IDX_T
299 : SORT_(search_geq)( SORT_KEY_T const * sorted,
300 : SORT_IDX_T cnt,
301 6153 : SORT_KEY_T query ) {
302 6153 : SORT_IDX_T j = (SORT_IDX_T)0;
303 6153 : SORT_IDX_T n = (SORT_IDX_T)cnt;
304 67653 : while( n>((SORT_IDX_T)1) ) {
305 : /* At this point the j corresponding to k is in [j,j+n) */
306 61500 : SORT_IDX_T j_left = j; SORT_IDX_T n_left = n >> 1;
307 61500 : SORT_IDX_T j_right = j + n_left; SORT_IDX_T n_right = n - n_left;
308 61500 : int go_left = SORT_BEFORE( query, sorted[ j_right ] );
309 61500 : j = go_left ? j_left : j_right; /* branchless */
310 61500 : n = go_left ? n_left : n_right; /* branchless */
311 61500 : }
312 6153 : return j;
313 6153 : }
314 :
315 : #if SORT_PARALLEL
316 :
317 : SORT_STATIC FD_MAP_REDUCE_PROTO( SORT_(private_merge_para) );
318 : SORT_STATIC FD_FOR_ALL_PROTO ( SORT_(private_memcpy_para) );
319 :
320 : static inline SORT_KEY_T *
321 : SORT_(stable_fast_para)( fd_tpool_t * tpool,
322 : ulong t0,
323 : ulong t1,
324 : SORT_KEY_T * key,
325 : SORT_IDX_T cnt,
326 0 : void * scratch ) {
327 :
328 : /* The wallclock time of the below for N keys and T threads is
329 : roughly:
330 :
331 : alpha N + beta N (ln N - ln T) / T + gamma ln T
332 :
333 : where the first term represents the cost of the final sort rounds
334 : (which are parallelized over increasingly fewer threads than the
335 : initial rounds ... wallclock sums up to something proportional to N
336 : in the limit of large T), the second term represents the cost of
337 : the initial parallel rounds (which are parallelized over T threads)
338 : and the last term represents the wallclock to dispatch and
339 : synchronize the threads. Optimizing T for a given N yields:
340 :
341 : -beta N (ln N - ln T_opt) / T_opt^2 - beta N / T_opt^2 + gamma / T_opt = 0
342 :
343 : Solving for T_opt in the limit N >> T_opt yields:
344 :
345 : T_opt = (beta/gamma) N ln N
346 :
347 : where the ratio beta is roughly the merge pass marginal wallclock
348 : per key and gamma is proportional to the marginal wallclock to
349 : start and stop a thread. Since these are typically used over a
350 : domain of moderate N, we can approximate ln N by ln N_ref,
351 : yielding:
352 :
353 : T_opt ~ N / thresh
354 :
355 : where thresh ~ gamma / (beta ln N_ref). We use an empirical value
356 : such threads will have at least one normal page of work of keys to
357 : process in a block typically. */
358 :
359 0 : if( FD_UNLIKELY( cnt<2L ) ) return key;
360 :
361 0 : static ulong const thresh = (4096UL + sizeof(SORT_KEY_T)-1UL) / sizeof(SORT_KEY_T); /* ceil(page_sz/key_sz) */
362 0 : ulong t_cnt = fd_ulong_min( t1 - t0, ((ulong)cnt) / thresh );
363 0 : if( FD_UNLIKELY( t_cnt<2UL ) ) return SORT_(stable_fast)( key, cnt, scratch );
364 0 : t1 = t0 + t_cnt;
365 :
366 0 : SORT_KEY_T * out[1];
367 0 : FD_MAP_REDUCE( SORT_(private_merge_para), tpool,t0,t1, 0L,(long)cnt, out, key, scratch );
368 0 : return out[0];
369 0 : }
370 :
371 : static inline SORT_KEY_T *
372 : SORT_(stable_para)( fd_tpool_t * tpool,
373 : ulong t0,
374 : ulong t1,
375 : SORT_KEY_T * key,
376 : SORT_IDX_T cnt,
377 0 : void * scratch ) {
378 :
379 : /* This works the same as the above but does a parallel memcpy if the
380 : result doesn't end up in the correct place. */
381 :
382 0 : if( FD_UNLIKELY( cnt<2L ) ) return key;
383 :
384 0 : static ulong const thresh = (4096UL + sizeof(SORT_KEY_T)-1UL) / sizeof(SORT_KEY_T); /* ceil(page_sz/key_sz) */
385 0 : ulong t_cnt = fd_ulong_min( t1 - t0, ((ulong)cnt) / thresh );
386 0 : if( FD_UNLIKELY( t_cnt<2UL ) ) return SORT_(stable)( key, cnt, scratch );
387 0 : t1 = t0 + t_cnt;
388 :
389 0 : SORT_KEY_T * out[1];
390 0 : FD_MAP_REDUCE( SORT_(private_merge_para), tpool,t0,t1, 0L,(long)cnt, out, key, scratch );
391 0 : if( out[0]!=key ) FD_FOR_ALL( SORT_(private_memcpy_para), tpool,t0,t1, 0L,(long)cnt, key, scratch ); /* 50/50 branch prob */
392 0 : return key;
393 0 : }
394 :
395 : SORT_STATIC void
396 : SORT_(private_quick_node)( void * _tpool,
397 : ulong t0, ulong t1,
398 : void * _args,
399 : void * _reduce, ulong _stride,
400 : ulong _l0, ulong _l1,
401 : ulong _m0, ulong _m1,
402 : ulong _n0, ulong _n1 );
403 :
404 : static inline SORT_KEY_T *
405 : SORT_(inplace_para)( fd_tpool_t * tpool,
406 : ulong t0,
407 : ulong t1,
408 : SORT_KEY_T * key,
409 0 : SORT_IDX_T cnt ) {
410 0 : if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key;
411 0 : SORT_(private_quick_node)( tpool,t0,t1, (void *)key, (void *)(ulong)cnt, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL );
412 0 : return key;
413 0 : }
414 :
415 : #if FD_HAS_ALLOCA
416 :
417 : SORT_STATIC SORT_KEY_T *
418 : SORT_(fast_para)( fd_tpool_t * tpool, ulong t0, ulong t1,
419 : SORT_KEY_T * key,
420 : SORT_IDX_T cnt,
421 : void * scratch,
422 : ulong seed,
423 : int stable );
424 :
425 : #endif /* FD_HAS_ALLOCA */
426 :
427 : #endif /* SORT_PARALLEL */
428 :
429 : FD_PROTOTYPES_END
430 :
431 : #endif
432 :
433 : #if SORT_IMPL_STYLE!=1 /* need implementations (assumes header already included) */
434 :
435 : SORT_KEY_T *
436 : SORT_(insert)( SORT_KEY_T * key,
437 5147580 : SORT_IDX_T cnt ) {
438 118479144 : for( SORT_IDX_T i=((SORT_IDX_T)1); i<cnt; i++ ) {
439 113331564 : SORT_KEY_T key_i = key[i];
440 113331564 : SORT_IDX_T j = i;
441 935723698 : while( j ) {
442 931917198 : SORT_IDX_T k = j - ((SORT_IDX_T)1);
443 931917198 : SORT_KEY_T key_j = key[k]; if( !(SORT_BEFORE( key_i, key_j )) ) break;
444 822392134 : key[j] = key_j;
445 822392134 : j = k;
446 822392134 : }
447 113331564 : key[j] = key_i;
448 113331564 : }
449 5147580 : return key;
450 5147580 : }
451 :
452 : static void
453 : SORT_(private_merge_pass)( SORT_KEY_T const * key_l, long cnt_l,
454 : SORT_KEY_T const * key_r, long cnt_r,
455 2659872 : SORT_KEY_T * key_m ) {
456 2659872 : long i = 0L;
457 2659872 : long j = 0L;
458 2659872 : long k = 0L;
459 2659872 : # if 1 /* Minimal C language operations */
460 144635166 : for(;;) { /* Note that cnt_left>0 and cnt_right>0 as cnt > SORT_MERGE_THRESH >= 1 at this point */
461 144635166 : if( SORT_BEFORE( key_r[j], key_l[i] ) ) {
462 52344009 : key_m[k++] = key_r[j++];
463 52344009 : if( j>=cnt_r ) {
464 170760 : memcpy( key_m + k, key_l + i, sizeof(SORT_KEY_T)*(ulong)(cnt_l-i) ); /* append left stragglers (at least one) */
465 170760 : break;
466 170760 : }
467 92291157 : } else {
468 92291157 : key_m[k++] = key_l[i++];
469 92291157 : if( i>=cnt_l ) {
470 2489112 : memcpy( key_m + k, key_r + j, sizeof(SORT_KEY_T)*(ulong)(cnt_r-j) ); /* append right stragglers (at least one) */
471 2489112 : break;
472 2489112 : }
473 92291157 : }
474 144635166 : }
475 : # else /* Branch free variant (Pyth investigations suggested the above is better) */
476 : while( ((i<cnt_l) & (j<cnt_r)) ) {
477 : SORT_KEY_T ki = key_l[ i ];
478 : SORT_KEY_T kj = key_r[ j ];
479 : int use_left = !(SORT_BEFORE( kj, ki ));
480 : key_m[ k ] = use_left ? ki : kj; /* cmov ideally */
481 : i += (long) use_left;
482 : j += (long)!use_left;
483 : k++;
484 : }
485 : if( i<cnt_l ) memcpy( key_m + k, key_l + i, sizeof(SORT_KEY_T)*(ulong)(cnt_l-i) );
486 : else if( j<cnt_r ) memcpy( key_m + k, key_r + j, sizeof(SORT_KEY_T)*(ulong)(cnt_r-j) );
487 : # endif
488 2659872 : }
489 :
490 : SORT_KEY_T *
491 : SORT_(private_merge)( SORT_KEY_T * key,
492 : long cnt,
493 6340224 : SORT_KEY_T * tmp ) {
494 :
495 : /* FIXME: USE IMPLICIT RECURSION ALA QUICK BELOW? */
496 :
497 : /* If below threshold, use insertion sort */
498 :
499 6340224 : if( cnt<=((long)(SORT_MERGE_THRESH)) ) return SORT_(insert)( key, (SORT_IDX_T)cnt );
500 :
501 : /* Otherwise, break input in half and sort the halves */
502 :
503 2659872 : SORT_KEY_T * key_l = key;
504 2659872 : SORT_KEY_T * tmp_l = tmp;
505 2659872 : long cnt_l = cnt >> 1;
506 2659872 : SORT_KEY_T * in_l = SORT_(private_merge)( key_l, cnt_l, tmp_l );
507 :
508 2659872 : SORT_KEY_T * key_r = key + cnt_l;
509 2659872 : SORT_KEY_T * tmp_r = tmp + cnt_l;
510 2659872 : long cnt_r = cnt - cnt_l;
511 2659872 : SORT_KEY_T * in_r = SORT_(private_merge)( key_r, cnt_r, tmp_r );
512 :
513 : /* Merge in_l / in_r */
514 :
515 2659872 : SORT_KEY_T * out = (in_l==key) ? tmp : key; /* If in_l overlaps with key, merge into tmp. Otherwise, merge into key */
516 :
517 : /* Note that in_l does not overlap with out at this point. in_r might
518 : overlap with the right half of out but the merge pass is fine for
519 : that case. */
520 :
521 2659872 : SORT_(private_merge_pass)( in_l, cnt_l, in_r, cnt_r, out );
522 :
523 2659872 : return out;
524 6340224 : }
525 :
526 : /* This uses a dual pivot quick sort for better theoretical and
527 : practical mojo. */
528 :
529 : SORT_KEY_T *
530 : SORT_(private_quick)( SORT_KEY_T * key,
531 1998366 : SORT_IDX_T cnt ) {
532 1998366 : SORT_IDX_T stack[ 4UL*8UL*sizeof(SORT_IDX_T) ]; /* See note below on sizing */
533 1998366 : ulong stack_cnt = 0UL;
534 :
535 1998366 : stack[ stack_cnt++ ] = (SORT_IDX_T)0;
536 1998366 : stack[ stack_cnt++ ] = cnt;
537 6451488 : for(;;) {
538 :
539 : /* Pop the next partition to process */
540 :
541 6451488 : if( !stack_cnt ) return key; /* All done */
542 4453122 : SORT_IDX_T h = stack[ --stack_cnt ];
543 4453122 : SORT_IDX_T l = stack[ --stack_cnt ];
544 :
545 : /* [l,h) is the partition we are sorting. If this partition is
546 : small enough, sort it via insertion sort. */
547 :
548 4453122 : SORT_IDX_T n = h-l;
549 4453122 : if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) {
550 1195776 : SORT_(insert)( key+l, n );
551 1195776 : continue;
552 1195776 : }
553 :
554 : /* This partition is too large to insertion sort. Pick two pivots,
555 : sort the partition into a left, center and right partition and
556 : then recursively sort those partitions. We initially use a
557 : simple choice of pivots that is ideal for nearly sorted data (in
558 : either direction) with a little extra sampling to improve the
559 : partitioning for randomly ordered data. There is a possibility
560 : that this deterministic choice of pivots will not succeed in
561 : making two or more non-empty partitions; we detect and correct
562 : that below. */
563 :
564 : /* Initial pivot selection */
565 :
566 3257346 : SORT_KEY_T p1;
567 3257346 : SORT_KEY_T p2;
568 3257346 : do {
569 3257346 : SORT_IDX_T n3 = n / (SORT_IDX_T)3; /* magic multiply under the hood typically */
570 3257346 : SORT_IDX_T h1 = h - (SORT_IDX_T)1;
571 3257346 : SORT_KEY_T p0 = key[ l ];
572 3257346 : /**/ p1 = key[ l + n3 ];
573 3257346 : /**/ p2 = key[ h1 - n3 ];
574 3257346 : SORT_KEY_T p3 = key[ h1 ];
575 3257346 : # if SORT_QUICK_ORDER_STYLE==0
576 : /* Generates better code for integral types (branchless via stack) */
577 3257346 : SORT_KEY_T _k[2]; ulong _c;
578 16286730 : # define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
579 : # else
580 : /* Generates much better code for floating point types (branchless
581 : via min / max floating point instructions) */
582 : SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
583 : # define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
584 : # endif
585 :
586 3257346 : ORDER(p0,p2); ORDER(p1,p3);
587 3257346 : ORDER(p0,p1); ORDER(p2,p3);
588 3257346 : ORDER(p1,p2);
589 3257346 : # undef ORDER
590 3257346 : } while(0);
591 :
592 3645678 : retry: /* Three way partitioning (silly language restriction) */;
593 3645678 : SORT_IDX_T i = l;
594 3645678 : SORT_IDX_T j = l;
595 3645678 : SORT_IDX_T k = h-(SORT_IDX_T)1;
596 509786277 : do { /* Note [j,k] is non-empty (look ma ... no branches!) */
597 :
598 : /* At this point, p1 <= p2 and:
599 : - keys [l,i) are before p1 (left partition)
600 : - keys [i,j) are in [p1,p2] (center partition)
601 : - keys [j,k] are not yet partitioned (region is non-empty)
602 : - keys (k,h) are after p2 (right partition)
603 : Decide where key j should be moved. */
604 :
605 509786277 : SORT_KEY_T kj = key[j];
606 :
607 509786277 : int to_left = (SORT_BEFORE( kj, p1 ));
608 509786277 : int to_right = (SORT_BEFORE( p2, kj ));
609 509786277 : SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );
610 :
611 : # if SORT_QUICK_SWAP_MINIMIZE
612 : if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; } /* ~2/3 prob */
613 : # else
614 : /**/ key[j] = key[m]; key[m] = kj;
615 509786277 : # endif
616 :
617 509786277 : i += (SORT_IDX_T) to_left;
618 509786277 : j += (SORT_IDX_T)(to_right^1); /* to_left_or_to_center <=> !to_right */
619 509786277 : k -= (SORT_IDX_T) to_right;
620 509786277 : } while( j<=k );
621 :
622 : /* Schedule recursion */
623 :
624 3645678 : if( FD_UNLIKELY( (j-i)==n ) ) {
625 :
626 : /* All the keys ended up in the center partition. If p1<p2, we
627 : had an unlucky choice of pivots such that p1==min(key[i]) and
628 : p2==max(key[i]) for i in [l,h). Since we picked the keys
629 : deterministically above, we would have the same bad luck the
630 : next time we processed this partition. But, because there are
631 : at least two distinct elements in the partition, there is a
632 : choice of pivots that will make a non-trivial partition, so we
633 : need to redo this partition with different pivots.
634 :
635 : Randomly picking a new pivot pair in [l,h) has probability
636 : between ~50% and ~100%. The worst case is half the keys in
637 : the partition equal to p1 and the other half equal to p2; the
638 : best case exactly one key is equal to p1 / p2 and all other
639 : keys are in (p1,p2] / [p1,p2).
640 :
641 : The optimal handling of the worst case though is just to set
642 : p1=p2 (p2=p1). This will pull the keys equal to p1 (p2) into
643 : the left (right) partition and the remaining keys into the
644 : center partition; the right (left) partition will be empty.
645 : Thus, this is guaranteed to yield a non-trivial partition of
646 : the center partition but may not yield as load balanced set of
647 : partitions as random for the next iteration. We go with the
648 : deterministic option for simplicity below. */
649 :
650 1550499 : if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }
651 :
652 : /* p1==p2 here ... all keys in this partition are the same
653 : We don't need to recurse further in this partition. */
654 :
655 2095179 : } else {
656 :
657 : /* Between [1,n-1] keys ended up in the center partition such that
658 : at least 1 key ended up in another partition. We push all
659 : three partitions onto the partition stack for recursion. The
660 : order of the pushes is important to bound the maximum stack
661 : usage. Below, we push the center partition followed by the
662 : larger of the left and right partitions then followed by the
663 : smaller of the left and right partitions (such that we process
664 : the smallest next iteration). For this, the same O(log_2 cnt)
665 : max recursion depth applies as quicksort even this does a three
666 : way partitioning. That is, let fl/fc/fr be the fraction of
667 : keys in the left, center, right partitions. At this point, fl
668 : is in [0,1), fc is in (0,1) and fr is in [0,1) and fl+fc+fr=1.
669 : The smaller of the left or right partition is what process next
670 : and, as the smaller of the left or right, it fraction of the
671 : keys is at most 0.5(1-fc)<=0.5. We do push up to two
672 : partitions onto the stack (four SORT_IDX_T) each level of
673 : recursion though instead of one like normal quick sort though. */
674 :
675 2095179 : int left_larger = (i-l) > (h-j); /* True if the left partition should go on the stack */
676 2095179 : SORT_IDX_T l_larger = SORT_IDX_IF( left_larger, l, j );
677 2095179 : SORT_IDX_T h_larger = SORT_IDX_IF( left_larger, i, h );
678 2095179 : SORT_IDX_T l_smaller = SORT_IDX_IF( left_larger, j, l );
679 2095179 : SORT_IDX_T h_smaller = SORT_IDX_IF( left_larger, h, i );
680 :
681 : /* Immediately process empty partitions */
682 2095179 : ulong push_smaller = (ulong)(l_smaller<h_smaller); FD_COMPILER_FORGET( push_smaller );
683 :
684 : /* Immediately process partitions where all keys are the same */
685 2095179 : ulong push_center = (ulong)(SORT_BEFORE( p1, p2 )); FD_COMPILER_FORGET( push_center );
686 :
687 : /* Guaranteed non-empty (larger of left or right have at least 1 key) */
688 2095179 : ulong push_larger = 1UL;
689 :
690 2095179 : stack[ stack_cnt ] = l_larger; stack_cnt += push_larger;
691 2095179 : stack[ stack_cnt ] = h_larger; stack_cnt += push_larger;
692 2095179 : stack[ stack_cnt ] = i; stack_cnt += push_center;
693 2095179 : stack[ stack_cnt ] = j; stack_cnt += push_center;
694 2095179 : stack[ stack_cnt ] = l_smaller; stack_cnt += push_smaller;
695 2095179 : stack[ stack_cnt ] = h_smaller; stack_cnt += push_smaller;
696 2095179 : }
697 3645678 : }
698 : /* never get here */
699 1998366 : }
700 :
701 : /* This works identical to sort_private_quick. Only differences
702 : relevant to selection are commented (in short, we only recurse on the
703 : partition that could contain rank). See above for comments on the
704 : algo. */
705 :
706 : SORT_KEY_T *
707 : SORT_(private_select)( SORT_KEY_T * key,
708 : SORT_IDX_T cnt,
709 231492 : SORT_IDX_T rnk ) {
710 231492 : SORT_IDX_T l = (SORT_IDX_T)0;
711 231492 : SORT_IDX_T h = cnt;
712 748695 : for(;;) {
713 :
714 : /* The partition [l,h) contains rnk. If this partition is small
715 : enough, sort it via insertion sort. FIXME: probably could
716 : truncate the insertion sort early for further optimization (e.g.
717 : if rnk==l / rnk==h-1, we only need to find the min/max for O(n)
718 : operation). */
719 :
720 748695 : SORT_IDX_T n = h-l;
721 748695 : if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) { SORT_(insert)( key+l, n ); return key; }
722 :
723 517227 : SORT_KEY_T p1;
724 517227 : SORT_KEY_T p2;
725 517227 : do {
726 517227 : SORT_IDX_T n3 = n / (SORT_IDX_T)3;
727 517227 : SORT_IDX_T h1 = h - (SORT_IDX_T)1;
728 517227 : SORT_KEY_T p0 = key[ l ];
729 517227 : /**/ p1 = key[ l + n3 ];
730 517227 : /**/ p2 = key[ h1 - n3 ];
731 517227 : SORT_KEY_T p3 = key[ h1 ];
732 517227 : # if SORT_QUICK_ORDER_STYLE==0
733 517227 : SORT_KEY_T _k[2]; ulong _c;
734 2586135 : # define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
735 : # else
736 : SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
737 : # define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
738 : # endif
739 :
740 517227 : ORDER(p0,p2); ORDER(p1,p3);
741 517227 : ORDER(p0,p1); ORDER(p2,p3);
742 517227 : ORDER(p1,p2);
743 517227 : # undef ORDER
744 517227 : } while(0);
745 :
746 517234 : retry: /* Silly language restriction */;
747 517234 : SORT_IDX_T i = l;
748 517234 : SORT_IDX_T j = l;
749 517234 : SORT_IDX_T k = h-(SORT_IDX_T)1;
750 57778525 : do {
751 :
752 57778525 : SORT_KEY_T kj = key[j];
753 :
754 57778525 : int to_left = (SORT_BEFORE( kj, p1 ));
755 57778525 : int to_right = (SORT_BEFORE( p2, kj ));
756 57778525 : SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );
757 :
758 : # if SORT_QUICK_SWAP_MINIMIZE
759 : if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; }
760 : # else
761 : /**/ key[j] = key[m]; key[m] = kj;
762 57778525 : # endif
763 :
764 57778525 : i += (SORT_IDX_T) to_left;
765 57778525 : j += (SORT_IDX_T)(to_right^1);
766 57778525 : k -= (SORT_IDX_T) to_right;
767 57778525 : } while( j<=k );
768 :
769 517234 : if( FD_UNLIKELY( (j-i)==n ) ) {
770 :
771 31 : if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }
772 :
773 : /* p1==p2 here ... all keys in this partition are the same. We
774 : don't need to recurse further in this partition as all keys
775 : would do here. */
776 :
777 24 : return key;
778 31 : }
779 :
780 : /* At this point:
781 : - [l,i) is the left partition,
782 : - [i,j) is the center partition
783 : - [j,h) is the right partition
784 : Between [1,n-1] keys ended up in the center partition such that
785 : at least 1 key ended up in another partition. Recurse on the
786 : partition that contains rnk. As this is typically used in median
787 : finding, we assume that the center partition is the most likely
788 : in the below selection (this can also be done branchlessly). */
789 :
790 517203 : SORT_IDX_T l_next = i; SORT_IDX_T h_next = j;
791 517203 : if( FD_UNLIKELY( rnk< i ) ) l_next = l, h_next = i;
792 517203 : if( FD_UNLIKELY( j <=rnk ) ) l_next = j, h_next = h;
793 517203 : l = l_next; h = h_next;
794 517203 : }
795 : /* never get here */
796 231492 : }
797 :
798 : #if SORT_PARALLEL
799 :
800 : /* This works identical to sort_private_quick. Only differences
801 : relevant to parallelization are commented. See above for comments on
802 : the algo. */
803 :
804 : void
805 : SORT_(private_quick_node)( void * _tpool,
806 : ulong t0, ulong t1,
807 : void * _args,
808 : void * _reduce, ulong _stride,
809 : ulong _l0, ulong _l1,
810 : ulong _m0, ulong _m1,
811 0 : ulong _n0, ulong _n1 ) {
812 0 : (void)_stride; (void)_l0; (void)_l1; (void)_m0; (void)_m1; (void)_n0; (void)_n1;
813 :
814 0 : fd_tpool_t * tpool = (fd_tpool_t *) _tpool;
815 0 : SORT_KEY_T * key = (SORT_KEY_T *) _args;
816 0 : SORT_IDX_T cnt = (SORT_IDX_T)(ulong)_reduce;
817 :
818 0 : SORT_IDX_T stack[ 4UL*8UL*sizeof(SORT_IDX_T) ];
819 0 : ulong stack_cnt = 0UL;
820 :
821 0 : ulong wait_stack[ 82 ]; /* See note below for sizing considerations */
822 0 : ulong wait_stack_cnt = 0UL;
823 :
824 0 : stack[ stack_cnt++ ] = (SORT_IDX_T)0;
825 0 : stack[ stack_cnt++ ] = cnt;
826 :
827 0 : while( stack_cnt ) {
828 :
829 0 : SORT_IDX_T h = stack[ --stack_cnt ];
830 0 : SORT_IDX_T l = stack[ --stack_cnt ];
831 :
832 0 : SORT_IDX_T n = h-l;
833 0 : if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) {
834 0 : SORT_(insert)( key+l, n );
835 0 : continue;
836 0 : }
837 :
838 0 : SORT_KEY_T p1;
839 0 : SORT_KEY_T p2;
840 0 : do {
841 0 : SORT_IDX_T n3 = n / (SORT_IDX_T)3;
842 0 : SORT_IDX_T h1 = h - (SORT_IDX_T)1;
843 0 : SORT_KEY_T p0 = key[ l ];
844 0 : /**/ p1 = key[ l + n3 ];
845 0 : /**/ p2 = key[ h1 - n3 ];
846 0 : SORT_KEY_T p3 = key[ h1 ];
847 0 : # if SORT_QUICK_ORDER_STYLE==0
848 0 : SORT_KEY_T _k[2]; ulong _c;
849 0 : # define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
850 : # else
851 : SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
852 : # define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
853 : # endif
854 :
855 0 : ORDER(p0,p2); ORDER(p1,p3);
856 0 : ORDER(p0,p1); ORDER(p2,p3);
857 0 : ORDER(p1,p2);
858 0 : # undef ORDER
859 0 : } while(0);
860 :
861 0 : retry: /* Silly language restriction */;
862 0 : SORT_IDX_T i = l;
863 0 : SORT_IDX_T j = l;
864 0 : SORT_IDX_T k = h-(SORT_IDX_T)1;
865 0 : do {
866 0 : SORT_KEY_T kj = key[j];
867 :
868 0 : int to_left = (SORT_BEFORE( kj, p1 ));
869 0 : int to_right = (SORT_BEFORE( p2, kj ));
870 0 : SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );
871 :
872 : # if SORT_QUICK_SWAP_MINIMIZE
873 : if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; }
874 : # else
875 : /**/ key[j] = key[m]; key[m] = kj;
876 0 : # endif
877 :
878 0 : i += (SORT_IDX_T) to_left;
879 0 : j += (SORT_IDX_T)(to_right^1);
880 0 : k -= (SORT_IDX_T) to_right;
881 0 : } while( j<=k );
882 :
883 0 : if( FD_UNLIKELY( (j-i)==n ) ) {
884 :
885 0 : if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }
886 :
887 0 : } else {
888 :
889 : /* At this point, we have at most 3 partitions to sort and can use
890 : the caller and tpool threads (t0,t1) to sort them. To load
891 : balance this, we sort the partitions by an estimate of how much
892 : work each requires. We estimate this as proportional to:
893 :
894 : work = cnt ceil log_3 cnt if cnt>0 and 0 otherwise
895 :
896 : where cnt is the number of keys in the partition. Since this
897 : is a rough estimate used for load balancing, we don't need a
898 : precise log_3 calculation. With log_3 x ~ 0.633 lg x, we
899 : approximate:
900 :
901 : ceil log_3 x ~ ceil( 5 (lg x) / 8 )
902 : ~ ceil( 5 (floor lg x) / 8)
903 : = floor( (5 (floor lg x) + 7)/8 )
904 : = (5 (msb x) + 7) >> 3
905 :
906 : This approximation is trivial to compute, is monotonic and is
907 : either exact or 1 less for x in (0,2^64). The approximation is
908 : such that cnt==1 / cnt>1 yields zero / positive. As such,
909 : work==0 strictly indicates no additional work is necessary. */
910 :
911 0 : # define WORK_APPROX(cnt) (((ulong)(cnt))*(ulong)((5*fd_ulong_find_msb_w_default( (ulong)(cnt), 0 )+7) >> 3))
912 :
913 0 : struct { SORT_IDX_T l; SORT_IDX_T h; ulong work; } part[3];
914 :
915 0 : part[0].l = l; part[0].h = i; part[0].work = WORK_APPROX( i-l );
916 0 : part[1].l = i; part[1].h = j; part[1].work = fd_ulong_if( SORT_BEFORE( p1, p2 ), WORK_APPROX( j-i ), 0UL );
917 0 : part[2].l = j; part[2].h = h; part[2].work = WORK_APPROX( h-j );
918 :
919 0 : # undef WORK_APPROX
920 :
921 0 : ulong work_remain = part[0].work + part[1].work + part[2].work;
922 :
923 0 : # define ORDER(i,j) fd_swap_if( part[j].work > part[i].work, part[i], part[j] )
924 :
925 0 : ORDER(0,2); ORDER(0,1); ORDER(1,2);
926 :
927 0 : # undef ORDER
928 :
929 0 : for( ulong idx=0UL; idx<3UL; idx++ ) {
930 :
931 : /* At this point, we need to schedule partitions [idx,2] requiring
932 : work_remain total estimated work and we have the caller and
933 : tpool threads (t0,t1) to do this work. Compute which threads
934 : should execute partition idx. */
935 :
936 0 : ulong p_work = part[idx].work;
937 0 : if( FD_UNLIKELY( !p_work ) ) break; /* no work need for this partition and remainin partitions */
938 :
939 0 : SORT_IDX_T p_l = part[idx].l;
940 0 : SORT_IDX_T p_h = part[idx].h;
941 :
942 0 : ulong t_cnt = t1 - t0;
943 0 : if( FD_UNLIKELY( (t_cnt>1UL) & (p_work<work_remain) ) ) {
944 :
945 : /* At this point, we have at least two partitions that need
946 : work to sort and at least two threads remaining for
947 : sorting. Schedule this partition for execution on between
948 : 1 and t_cnt-1 of the remaining threads roughly proportional
949 : to the estimated work to sort this partition versus the
950 : total estimated work remaining.
951 :
952 : For the wait stack, in the worst case, we schedule two
953 : partitions for remote execution covering two thirds of the
954 : keys each round. This implies the wait stack depth is
955 : bounded by:
956 :
957 : 2 ceil log_3 thread_cnt
958 :
959 : Practically, thread_cnt<=FD_TILE_MAX. For
960 : FD_TILE_MAX==1024, 14 is sufficient and 82 is sufficient
961 : for any thread_cnt representable by a ulong. */
962 :
963 0 : ulong ts = t1 - fd_ulong_min( fd_ulong_max( (t_cnt*p_work + (work_remain>>1)) / work_remain, 1UL ), t_cnt-1UL );
964 :
965 0 : fd_tpool_exec( tpool, ts, SORT_(private_quick_node), tpool, ts, t1, key + p_l, (void *)(ulong)(p_h - p_l),
966 0 : 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL );
967 :
968 0 : wait_stack[ wait_stack_cnt++ ] = ts;
969 :
970 : /* Advance t1 and work_remain for the next iteration */
971 :
972 0 : t1 = ts;
973 0 : work_remain -= p_work;
974 :
975 0 : } else {
976 :
977 : /* At this point, we have only one thread available and/or
978 : this is the only partition remaining that needs sorting.
979 : Schedule this partition for local execution. */
980 :
981 0 : stack[ stack_cnt++ ] = p_l;
982 0 : stack[ stack_cnt++ ] = p_h;
983 :
984 0 : }
985 0 : }
986 0 : }
987 0 : }
988 :
989 : /* Wait for any children to finish */
990 :
991 0 : while( wait_stack_cnt ) fd_tpool_wait( tpool, wait_stack[ --wait_stack_cnt ] );
992 0 : }
993 :
994 0 : FD_MAP_REDUCE_BEGIN( SORT_(private_merge_para), 1L, 0UL, sizeof(SORT_KEY_T *), 1L ) {
995 :
996 0 : SORT_KEY_T ** out = (SORT_KEY_T **)arg[0];
997 0 : SORT_KEY_T * key = (SORT_KEY_T * )arg[1];
998 0 : SORT_KEY_T * tmp = (SORT_KEY_T * )arg[2];
999 :
1000 0 : *out = SORT_(stable_fast)( key + block_i0, (SORT_IDX_T)block_cnt, tmp + block_i0 );
1001 :
1002 0 : } FD_MAP_END {
1003 :
1004 0 : SORT_KEY_T * key = (SORT_KEY_T * )arg[1];
1005 0 : SORT_KEY_T * tmp = (SORT_KEY_T * )arg[2];
1006 0 : SORT_KEY_T ** _in_l = (SORT_KEY_T **)arg[0]; SORT_KEY_T * in_l = *_in_l; long cnt_l = block_is - block_i0;
1007 0 : SORT_KEY_T ** _in_r = (SORT_KEY_T **)_r1; SORT_KEY_T * in_r = *_in_r; long cnt_r = block_i1 - block_is;
1008 :
1009 : /* Merge in_l / in_r (see private_merge above for details) */
1010 :
1011 0 : SORT_KEY_T * out = ((in_l==(key+block_i0)) ? tmp : key) + block_i0; /* cmov */
1012 :
1013 0 : SORT_(private_merge_pass)( in_l, cnt_l, in_r, cnt_r, out );
1014 :
1015 0 : *_in_l = out;
1016 :
1017 0 : } FD_REDUCE_END
1018 :
1019 0 : FD_FOR_ALL_BEGIN( SORT_(private_memcpy_para), 1L ) {
1020 :
1021 0 : SORT_KEY_T * dst = (SORT_KEY_T *)arg[0];
1022 0 : SORT_KEY_T const * src = (SORT_KEY_T const *)arg[1];
1023 :
1024 0 : if( FD_LIKELY( block_cnt ) ) memcpy( dst + block_i0, src + block_i0, sizeof(SORT_KEY_T)*(ulong)block_cnt );
1025 :
1026 0 : } FD_FOR_ALL_END
1027 :
1028 : #if FD_HAS_ALLOCA
1029 :
1030 0 : static FD_FOR_ALL_BEGIN( SORT_(private_cntcpy_para), 1L ) {
1031 0 : ulong tpool_base = (ulong )arg[0];
1032 0 : ulong t_cnt = (ulong )arg[1];
1033 0 : ulong * _key_cnt = (ulong *)arg[2];
1034 0 : SORT_KEY_T const * key = (SORT_KEY_T const *)arg[3];
1035 0 : SORT_KEY_T * tmp = (SORT_KEY_T *)arg[4];
1036 0 : SORT_KEY_T const * pivot = (SORT_KEY_T const *)arg[5];
1037 :
1038 : /* Note keys in [ pivot[t-1], pivot[t] ) for t in [0,t_cnt) are keys
1039 : assigned to thread t for sorting. pivot[-1] / pivot[t_cnt-1] are
1040 : an implied -infinity / +infinity and never explicitly accessed. As
1041 : such, pivot is indexed [0,t_cnt-1). */
1042 :
1043 : /* Allocate and clear a local scratch for counting. We don't count
1044 : directly into key_cnt to avoid false sharing while counting. */
1045 :
1046 0 : ulong * key_cnt = fd_alloca( alignof(ulong), sizeof(ulong)*t_cnt );
1047 :
1048 0 : memset( key_cnt, 0, sizeof(ulong)*t_cnt );
1049 :
1050 0 : for( long i=block_i0; i<block_i1; i++ ) {
1051 0 : SORT_KEY_T ki = key[i];
1052 :
1053 : /* Determine which thread is responsible for subsorting this key */
1054 : /* FIXME: ideally, use a unrolled fixed count iteration for here */
1055 :
1056 0 : ulong l = 0UL;
1057 0 : ulong h = t_cnt;
1058 0 : for(;;) {
1059 0 : ulong n = h - l;
1060 0 : if( n<2UL ) break;
1061 :
1062 : /* At this point, the thread for key i is in [l,h) and this range
1063 : contains at least two threads. Split this range in half. */
1064 :
1065 0 : ulong m = l + (n>>1); /* In [1,t_cnt) */
1066 0 : int c = SORT_BEFORE( ki, pivot[m-1UL] );
1067 :
1068 : /* If ki is before pivot[m], the target thread is in [l,m).
1069 : Otherwise, the target thread is in [m,h). */
1070 :
1071 0 : l = fd_ulong_if( c, l, m );
1072 0 : h = fd_ulong_if( c, m, h );
1073 0 : }
1074 :
1075 : /* At this point, key[i] should be assigned to thread l. Count it
1076 : and copy it into tmp to prepare to scatter. */
1077 :
1078 0 : key_cnt[l]++;
1079 0 : tmp[i] = ki;
1080 0 : }
1081 :
1082 : /* Send the counts back to main thread for partitioning */
1083 :
1084 0 : memcpy( _key_cnt + (tpool_t0-tpool_base)*t_cnt, key_cnt, sizeof(ulong)*t_cnt );
1085 :
1086 0 : } FD_FOR_ALL_END
1087 :
1088 0 : static FD_FOR_ALL_BEGIN( SORT_(private_scatter_para), 1L ) {
1089 0 : ulong tpool_base = (ulong )arg[0];
1090 0 : ulong t_cnt = (ulong )arg[1];
1091 0 : ulong const * _part = (ulong const *)arg[2];
1092 0 : SORT_KEY_T * key = (SORT_KEY_T *)arg[3];
1093 0 : SORT_KEY_T const * tmp = (SORT_KEY_T const *)arg[4];
1094 0 : SORT_KEY_T const * pivot = (SORT_KEY_T const *)arg[5];
1095 :
1096 : /* Receive the key array partitioning from the main thread. Like the
1097 : above, we don't operate on the part array directly to avoid false
1098 : sharing while scattering. */
1099 :
1100 0 : ulong * part = fd_alloca( alignof(ulong), sizeof(ulong)*t_cnt );
1101 :
1102 0 : memcpy( part, _part + (tpool_t0-tpool_base)*t_cnt, sizeof(ulong)*t_cnt );
1103 :
1104 0 : for( long i=block_i0; i<block_i1; i++ ) {
1105 0 : SORT_KEY_T ki = tmp[i];
1106 :
1107 : /* Determine which thread is responsible for subsort this key
1108 : (again). Identical considerations to the above. Note: we can
1109 : eliminate this computation if willing to burn O(N) additional
1110 : scratch memory by saving results computed above for use here. */
1111 :
1112 0 : ulong l = 0UL;
1113 0 : ulong h = t_cnt;
1114 0 : for(;;) {
1115 0 : ulong n = h - l;
1116 0 : if( n<2UL ) break;
1117 0 : ulong m = l + (n>>1);
1118 0 : int c = SORT_BEFORE( ki, pivot[m-1UL] );
1119 0 : l = fd_ulong_if( c, l, m );
1120 0 : h = fd_ulong_if( c, m, h );
1121 0 : }
1122 :
1123 : /* Send this key to target thread */
1124 :
1125 0 : key[ part[l]++ ] = ki;
1126 0 : }
1127 :
1128 0 : } FD_FOR_ALL_END
1129 :
1130 0 : static FD_FOR_ALL_BEGIN( SORT_(private_subsort_para), 1L ) {
1131 0 : ulong const * part = (ulong const *)arg[0];
1132 0 : SORT_KEY_T * key = (SORT_KEY_T *)arg[1];
1133 0 : SORT_KEY_T * tmp = (SORT_KEY_T *)arg[2];
1134 0 : int stable = (int )arg[3];
1135 :
1136 0 : ulong j0 = part[ block_i0 ];
1137 0 : ulong j1 = part[ block_i1 ];
1138 :
1139 0 : if( stable ) SORT_(stable) ( key + j0, j1 - j0, tmp + j0 );
1140 0 : else SORT_(inplace)( key + j0, j1 - j0 );
1141 :
1142 0 : } FD_FOR_ALL_END
1143 :
1144 : SORT_KEY_T *
1145 : SORT_(fast_para)( fd_tpool_t * tpool, ulong t0, ulong t1,
1146 : SORT_KEY_T * key,
1147 : SORT_IDX_T cnt,
1148 : void * scratch,
1149 : ulong seed,
1150 0 : int stable ) {
1151 :
1152 0 : if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key; /* nothing to do */
1153 :
1154 : /* For the below (if sampling ops are fully threaded), sampling costs
1155 : O(S), the sample sort costs O(S lg(T S)), the downsampling costs
1156 : O(1), the partitioning costs O(T), the counting and scattering cost
1157 : O(N (lg T) / T), the subsorts cost (N/T) lg(N/T) and thread
1158 : synchronization overhead is O(lg T).
1159 :
1160 : Combining all these, assuming the sampling ratio S is a fixed O(1)
1161 : quantity and assuming N (lg T) / T term in the counting/scattering
1162 : approximately cancals the -N lg T / T in the subsorts yields a
1163 : wallclock that scales as:
1164 :
1165 : alpha N (ln N) / T + beta ln T + gamma T
1166 :
1167 : The first term represents the parallelized sorting cost, the second
1168 : term represents the thread dispatch / sync cost, and the last term
1169 : represents the partitioning costs. Minimizing cost with respect
1170 : to T yields:
1171 :
1172 : -alpha (N ln N) / T_opt^2 + beta / T_opt + gamma = 0
1173 :
1174 : whose (positive) solution is:
1175 :
1176 : T_opt ~ (0.5 beta / gamma) [ sqrt( 1 + (4 alpha gamma N ln N)/beta^2) ) - 1 ]
1177 :
1178 : In the limit 4 alpha gamma N ln N / beta^2 >> 1 (that is,
1179 : partitioning costs are much lower than thread start/stop costs), we
1180 : have:
1181 :
1182 : T_opt -> (alpha N ln N) / beta
1183 :
1184 : In the other limit, we have:
1185 :
1186 : T_opt -> sqrt( alpha N ln N / gamma )
1187 :
1188 : The first limit tends to apply in practice. Thus we use:
1189 :
1190 : T_opt ~ (N lg N) / thresh
1191 : ~ floor( (N msb N + N/2 + thresh/2) / thresh)
1192 :
1193 : where thresh is an empirical minimum amount of sorting work to
1194 : justify starting / stopping a thread. (As written below, the gamma
1195 : term is more like gamma T^2, which makes the other limit more like
1196 : the cube root of N ln N but doesn't change the the overall
1197 : conclusion.) */
1198 :
1199 0 : ulong thresh = (4096UL + sizeof(SORT_KEY_T)-1UL) / sizeof(SORT_KEY_T);
1200 0 : ulong t_cnt = fd_ulong_min( t1 - t0, (cnt*(ulong)fd_ulong_find_msb( cnt ) + ((cnt+thresh)>>1)) / thresh );
1201 0 : if( FD_UNLIKELY( t_cnt<2UL ) ) return stable ? SORT_(stable)( key, cnt, scratch ) : SORT_(inplace)( key, cnt );
1202 0 : t1 = t0 + t_cnt;
1203 :
1204 : /* At this point, we have at least 2 threads available and at least 2
1205 : items to sort. Sample the keys to get some idea of their
1206 : distribution, sort the samples and downsample the sorted samples
1207 : into pivots that approximately uniformly partition the samples into
1208 : t_cnt groups (and thus approximately partition the keys uniformly
1209 : too). Notes:
1210 :
1211 : - The sampling below is practically uniform IID but it could be
1212 : made more robust with a rejection method (ala fd_rng_ulong_roll).
1213 :
1214 : - Increasing SORT_OVERSAMPLE_RATIO improves the uniformity of the
1215 : partitioning of key space over the inputs but requires more stack
1216 : scratch memory and more overhead.
1217 :
1218 : - All three steps here could be parallelized but it is probably not
1219 : worth it given the overhead for threaded versus the number of
1220 : samples.
1221 :
1222 : - For a stable sort, the result is completely deterministic even
1223 : if the seed provided is non-deterministic. */
1224 :
1225 0 : ulong sample_cnt = t_cnt*(ulong)SORT_OVERSAMPLE_RATIO;
1226 0 : SORT_KEY_T * pivot = fd_alloca( alignof(ulong), sizeof(ulong)*sample_cnt );
1227 :
1228 0 : for( ulong i=0UL; i<sample_cnt; i++ ) pivot[i] = key[ fd_ulong_hash( seed ^ i ) % cnt ];
1229 :
1230 0 : SORT_(inplace)( pivot, sample_cnt );
1231 :
1232 0 : for( ulong i=1UL; i<t_cnt; i++ ) pivot[i-1UL] = pivot[ i*(ulong)SORT_OVERSAMPLE_RATIO ];
1233 :
1234 : /* At this point, keys in [ pivot[t-1], pivot[t] ) should be assigned
1235 : to thread t for thread t's subsort. pivot[-1] / pivot[t_cnt-1] are
1236 : an implicit -infinity / +infinity. Split the input array equally
1237 : over threads and have each thread copy their block of keys into tmp
1238 : and count the number of keys in its block that could be assigned to
1239 : each thread. */
1240 :
1241 0 : # define part(i,j) part[ (i) + t_cnt*(j) ]
1242 :
1243 0 : ulong * part = fd_alloca( alignof(ulong), sizeof(ulong)*t_cnt*t_cnt );
1244 :
1245 0 : SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
1246 :
1247 0 : FD_FOR_ALL( SORT_(private_cntcpy_para), tpool,t0,t1, 0L,(long)cnt, t0, t_cnt, part, key, tmp, pivot );
1248 :
1249 : /* At this point, part(i,j) is the count of the number of keys in thread
1250 : j's block that were assigned to thread i's subsort. Convert this
1251 : into a partitioning. In-principle this can be parallelized but
1252 : this is rarely worth it practically. */
1253 :
1254 0 : ulong k = 0UL;
1255 0 : for( ulong i=0UL; i<t_cnt; i++ ) {
1256 0 : for( ulong j=0UL; j<t_cnt; j++ ) {
1257 0 : ulong c_ij = part(i,j);
1258 0 : part(i,j) = k;
1259 0 : k += c_ij;
1260 0 : }
1261 0 : }
1262 :
1263 : /* At this point, the range [ part(i,j), part(i+1,j) ) is where keys
1264 : in thread j's block assigned to thread i's subsort should be
1265 : scattered. part(t_cnt,t_cnt-1) is an implied cnt. Scatter the
1266 : keys from tmp back into key in subsorted order. */
1267 :
1268 0 : FD_FOR_ALL( SORT_(private_scatter_para), tpool,t0,t1, 0L,(long)cnt, t0, t_cnt, part, key, tmp, pivot );
1269 :
1270 : /* At this point, keys [ part(i,0), part(i+1,0) ) are the keys assigned
1271 : to thread i for subsorting. part(t_cnt,0) is an implied cnt.
1272 : Since t_cnt>1 though this location exists. We make that explicit
1273 : such that part[i],part[i+1] give the sets of keys each thread
1274 : should sort. Do the thread parallel subsorts to get the keys into
1275 : final sorted order. */
1276 :
1277 0 : part[ t_cnt ] = cnt;
1278 :
1279 0 : FD_FOR_ALL( SORT_(private_subsort_para), tpool,t0,t1, 0L,(long)t_cnt, part, key, tmp, stable );
1280 :
1281 0 : # undef part
1282 :
1283 0 : return key;
1284 0 : }
1285 :
1286 : #endif /* FD_HAS_ALLOCA */
1287 : #endif /* SORT_PARALLEL */
1288 : #endif /* SORT_IMPL_STYLE!=1 */
1289 :
1290 : #undef SORT_
1291 : #undef SORT_STATIC
1292 :
1293 : #undef SORT_IMPL_STYLE
1294 : #undef SORT_IDX_IF
1295 : #undef SORT_OVERSAMPLE_RATIO
1296 : #undef SORT_PARALLEL
1297 : #undef SORT_QUICK_SWAP_MINIMIZE
1298 : #undef SORT_QUICK_ORDER_STYLE
1299 : #undef SORT_QUICK_THRESH
1300 : #undef SORT_MERGE_THRESH
1301 : #undef SORT_BEFORE
1302 : #undef SORT_IDX_T
1303 : #undef SORT_KEY_T
1304 : #undef SORT_NAME
|