LCOV - code coverage report
Current view: top level - util/tmpl - fd_sort.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 196 425 46.1 %
Date: 2025-07-01 05:00:49 Functions: 49 940 5.2 %

          Line data    Source code
       1             : /* Declares a family of functions useful for single threaded sorting of
       2             :    POD types in high performance contexts.  Example usage:
       3             : 
       4             :      #define SORT_NAME        sort_double_descend
       5             :      #define SORT_KEY_T       double
       6             :      #define SORT_BEFORE(a,b) (a)>(b)
       7             :      #include "util/tmpl/fd_sort.c"
       8             : 
       9             :    will create the following API for use in the local compile unit:
      10             : 
       11             :      // Sort key[i] for i in [0,cnt) stable in place in a best /
       12             :      // average / worst case of O(N) / O(N^2) / O(N^2) operations.
      13             : 
      14             :      double *
      15             :      sort_double_descend_insert( double * key,
      16             :                                  ulong    cnt );
      17             : 
      18             :      // Return 1 if cnt is a sensible value for the sorting APIs
      19             :      // (i.e. cnt*sizeof(double)+alignof(double)-1 will not overflow,
      20             :      // 3*cnt*ceil(log_3(cnt)) will not overflow, etc).
      21             : 
      22             :      int sort_double_descend_cnt_valid( ulong cnt );
      23             : 
      24             :      // Return the alignment and footprint required for a scratch region
      25             :      // adequate for sorting up to cnt elements.  The results will be
      26             :      // standard allocator and standard declaration friendly.  (E.g. a
      27             :      // declaration "double scratch[ cnt ];" will be fine as a scratch
      28             :      // region.)
      29             : 
      30             :      ulong sort_double_descend_stable_scratch_align    ( void      );
      31             :      ulong sort_double_descend_stable_scratch_footprint( ulong cnt );
      32             : 
      33             :      // Sort key[i] for i in [0,cnt) stable in best / average / worst
      34             :      // case of O(N lg N) / O(N lg N) / O(N lg N) operations.  Scratch
      35             :      // is a scratch workspace of suitable alignment and footprint.
      36             :      // Returns where the sorted values ended up.  Will be at either key
      37             :      // or (double *)scratch.
      38             : 
      39             :      double *
      40             :      sort_double_descend_stable_fast( double * key,
      41             :                                       ulong    cnt,
      42             :                                       void   * scratch );
      43             : 
      44             :      // Same as above but does additional copying if necessary such that
      45             :      // the final result ends up in key.  Returns key.
      46             : 
      47             :      double *
      48             :      sort_double_descend_stable( double * key,
      49             :                                  ulong    cnt,
      50             :                                  void *   scratch );
      51             : 
      52             :      // Sort key[i] for i in [0,cnt) inplace in best / average / worst
      53             :      // case of O(N lg N) / O(N lg N) / O(N^2) operations.  Returns key.
      54             : 
      55             :      double *
      56             :      sort_double_descend_inplace( double * key,
      57             :                                   ulong    cnt );
      58             : 
      59             :      // Partially sort key[i] such that key[rnk] holds rnk and return
      60             :      // key[rnk] in best / average / worst of O(N) / O(N) / O(N^2)
      61             :      // operations.  Returns key.  Assumes key is non-NULL, cnt is valid
      62             :      // and positive and rnk is in [0,cnt)
      63             : 
      64             :      double *
      65             :      sort_double_descend_select( double * key,
      66             :                                  ulong    cnt,
      67             :                                  ulong    rnk );
      68             : 
      69             :      // Binary search for element greater or equal to key in a sorted
      70             :      // list.  Return value is an index into sorted in [0,cnt].  If
      71             :      // return value > 0 then it is the largest index where
      72             :      // SORT_BEFORE(sorted[index-1],query)==1.   Returns 0 if
      73             :      // SORT_BEFORE(query,elem)==1 for all elements in sorted.  Assumes
      74             :      // SORT_BEFORE defines a total order over sorted, and each element
      75             :      // in sorted is less-or-equal than its successor.  (Returns
      76             :      // incorrect result otherwise!)   Robust if cnt==0 (in that case,
      77             :      // returns 0 and ignores pointer to sorted).
      78             : 
      79             :      ulong
      80             :      sort_double_descend_search_geq( double const * sorted,
      81             :                                      ulong          cnt,
      82             :                                      double         query );
      83             : 
       84             :      // sort_double_descend_{stable_fast,stable,inplace}_para is the
       85             :      // same as above but adaptively parallelized over the caller
      86             :      // (typically tpool thread t0) and tpool threads (t0,t1).  Assumes
      87             :      // tpool is valid, tpool threads (t0,t1) are idle (or soon to be
      88             :      // idle).  These are only available if SORT_PARALLEL was requested.
      89             : 
      90             :      double *
      91             :      sort_double_descend_stable_fast_para( fd_tpool_t * tpool, ulong t0, ulong t1,
      92             :                                            double * key,
      93             :                                            ulong    cnt,
      94             :                                            void *   scratch );
      95             : 
      96             :      double *
      97             :      sort_double_descend_stable_para( fd_tpool_t * tpool, ulong t0, ulong t1,
      98             :                                       double * key,
      99             :                                       ulong    cnt,
     100             :                                       void *   scratch );
     101             : 
     102             :      double *
     103             :      sort_double_descend_inplace_para( fd_tpool_t * tpool, ulong t0, ulong t1,
     104             :                                        double * key,
     105             :                                        ulong    cnt );
     106             : 
     107             :      // sort_double_descend_fast_para has better asymptotically
     108             :      // parallelization on average (~(N lg N)/T + lg T) but requires
     109             :      // more stack space on the caller (T^2).  seed gives a random seed
     110             :      // to use (e.g. fd_tickcount()).  If stable is non-zero, a stable
     111             :      // method will be used (and the resulting order will always be
     112             :      // deterministic).  Otherwise a faster unstable method will be used
     113             :      // (the order in which equal keys end up in the array will depend
      114             :      // on seed).  Assumes tpool is valid and tpool threads (t0,t1) are
     115             :      // idle (or soon to be idle).  Only available if SORT_PARALLEL was
     116             :      // requested and the target has FD_HAS_ALLOCA.
     117             : 
     118             :      double *
     119             :      sort_double_descend_fast_para( fd_tpool_t * tpool, ulong t0, ulong t1,
     120             :                                     double * key,
     121             :                                     ulong    cnt,
     122             :                                     void *   scratch,
     123             :                                     ulong    seed,
     124             :                                     int      stable );
     125             : 
     126             :    It is fine to include this template multiple times in a compilation
     127             :    unit.  Just provide the specification before each inclusion.  Various
     128             :    additional options to tune the methods are described below. */
     129             : 
     130             : /* SORT_NAME gives the name of the function to declare (and the base
     131             :    name of auxiliary and/or variant functions). */
     132             : 
     133             : #ifndef SORT_NAME
     134             : #error "SORT_NAME must be defined"
     135             : #endif
     136             : 
     137             : /* SORT_KEY_T gives the POD datatype to sort. */
     138             : 
     139             : #ifndef SORT_KEY_T
     140             : #error "SORT_KEY_T must be defined"
     141             : #endif
     142             : 
     143             : /* SORT_IDX_T gives the data type used to index the arrays */
     144             : 
     145             : #ifndef SORT_IDX_T
     146  1664241865 : #define SORT_IDX_T ulong
     147             : #endif
     148             : 
     149             : /* SORT_BEFORE(a,b) evaluates to 1 if a<b is strictly true.  SAFETY TIP:
     150             :    This is not a 3-way comparison function! */
     151             : 
     152             : #ifndef SORT_BEFORE
     153     9137072 : #define SORT_BEFORE(a,b) (a)<(b)
     154             : #endif
     155             : 
     156             : /* SORT_MERGE_THRESH / SORT_QUICK_THRESH give largest merge/quick
     157             :    partition size where insertion sort will be used.  Should be at least
     158             :    2 (and probably larger than one might expect to get optimal
     159             :    performance). */
     160             : 
     161             : #ifndef SORT_MERGE_THRESH
     162     6340224 : #define SORT_MERGE_THRESH 32
     163             : #endif
     164             : 
     165             : #ifndef SORT_QUICK_THRESH
     166             : #define SORT_QUICK_THRESH 32
     167             : #endif
     168             : 
     169             : /* SORT_QUICK_ORDER_STYLE selects the method used for ordering two keys.
     170             :    Roughly, say 1 for floating point keys (see quick method for
     171             :    details). */
     172             : 
     173             : #ifndef SORT_QUICK_ORDER_STYLE
     174             : #define SORT_QUICK_ORDER_STYLE 0
     175             : #endif
     176             : 
     177             : /* SORT_QUICK_SWAP_MINIMIZE indicates that quick sort should eat the
     178             :    non-deterministic branch cost to avoid no-op swaps.  This is useful
     179             :    if the key_t has a large memory footprint such that the no-op is
     180             :    actually a larger cost than a branch mispredict. */
     181             : 
     182             : #ifndef SORT_QUICK_SWAP_MINIMIZE
     183             : #define SORT_QUICK_SWAP_MINIMIZE 0
     184             : #endif
     185             : 
     186             : /* SORT_PARALLEL will generate thread parallel versions of many sorts.
     187             :    Requires tpool. */
     188             : 
     189             : #ifndef SORT_PARALLEL
     190             : #define SORT_PARALLEL 0
     191             : #endif
     192             : 
     193             : /* SORT_OVERSAMPLE_RATIO indicates the amount of oversampling fast
     194             :    parallel sorts should use to fine pivots.  Only relevant if
     195             :    SORT_PARALLEL is true and FD_HAS_ALLOCA. */
     196             : 
     197             : #ifndef SORT_OVERSAMPLE_RATIO
     198           0 : #define SORT_OVERSAMPLE_RATIO 5
     199             : #endif
     200             : 
     201             : /* SORT_IDX_IF(c,t,f) returns sort_idx t if c is non-zero and sort_idx f o.w. */
     202             : 
     203             : #ifndef SORT_IDX_IF
     204   575945518 : #define SORT_IDX_IF(c,t,f) ((SORT_IDX_T)fd_ulong_if( (c), (ulong)(t), (ulong)(f) ))
     205             : #endif
     206             : 
     207             : /* 0 - local use only
     208             :    1 - library header declaration
     209             :    2 - library implementation */
     210             : 
     211             : #ifndef SORT_IMPL_STYLE
     212             : #define SORT_IMPL_STYLE 0
     213             : #endif
     214             : 
     215             : /* Implementation *****************************************************/
     216             : 
     217             : #if SORT_IMPL_STYLE==0 /* local use only */
     218             : #define SORT_STATIC FD_FN_UNUSED static
     219             : #else /* library header and/or implementation */
     220             : #define SORT_STATIC
     221             : #endif
     222             : 
     223    16337550 : #define SORT_(x)FD_EXPAND_THEN_CONCAT3(SORT_NAME,_,x)
     224             : 
     225             : #if SORT_IMPL_STYLE!=2 /* need header */
     226             : 
     227             : #if SORT_PARALLEL
     228             : #include "../tpool/fd_tpool.h"
     229             : #else
     230             : #include "../bits/fd_bits.h"
     231             : #endif
     232             : 
     233             : FD_PROTOTYPES_BEGIN
     234             : 
/* cnt_valid returns 1 if cnt is a sane element count for these sorting
   APIs: 0, or positive and small enough that cnt*sizeof(SORT_KEY_T)
   plus alignment padding cannot overflow a ulong and cnt fits in the
   implementation's index budget. */

FD_FN_CONST static inline int
SORT_(cnt_valid)( SORT_IDX_T cnt ) {
  /* Written funny for supporting different signed idx types without
     generating compiler warnings. */
  /* First bound: largest cnt whose footprint fits in a ulong.  The
     unary minus on alignof is computed in ulong arithmetic, i.e.
     2^64 - align, so the division yields floor((2^64-align)/sz). */
  SORT_IDX_T max = (SORT_IDX_T)fd_ulong_min( (-alignof(SORT_KEY_T))/sizeof(SORT_KEY_T), /* ==floor( (2^64-align-1)/sz ) */
                                             (ulong)(SORT_IDX_T)((1UL<<57)-1UL) );      /* ==SORT_IDX_MAX as ulong */
  /* Accept cnt==0, 0<cnt<max, or cnt==max; bitwise | / & keep this
     branchless and warning-free for signed SORT_IDX_T. */
  return (!cnt) | ((((SORT_IDX_T)0)<cnt) & (cnt<max)) | (cnt==max);
}
     243             : 
/* Alignment and byte footprint of a scratch region able to hold cnt
   keys for the stable sorts (standard allocator friendly). */
FD_FN_CONST static inline ulong SORT_(stable_scratch_align)    ( void )           { return alignof(SORT_KEY_T); }
FD_FN_CONST static inline ulong SORT_(stable_scratch_footprint)( SORT_IDX_T cnt ) { return sizeof (SORT_KEY_T)*(ulong)cnt; }
     246             : 
     247             : SORT_STATIC SORT_KEY_T *
     248             : SORT_(private_merge)( SORT_KEY_T * key,
     249             :                       long         cnt,
     250             :                       SORT_KEY_T * tmp );
     251             : 
     252             : SORT_STATIC SORT_KEY_T *
     253             : SORT_(private_quick)( SORT_KEY_T * key,
     254             :                       SORT_IDX_T   cnt );
     255             : 
     256             : SORT_STATIC SORT_KEY_T *
     257             : SORT_(private_select)( SORT_KEY_T * key,
     258             :                        SORT_IDX_T   cnt,
     259             :                        SORT_IDX_T   rnk );
     260             : 
     261             : SORT_STATIC SORT_KEY_T *
     262             : SORT_(insert)( SORT_KEY_T * key,
     263             :                SORT_IDX_T   cnt );
     264             : 
     265             : static inline SORT_KEY_T *
     266             : SORT_(stable_fast)( SORT_KEY_T * key,
     267             :                     SORT_IDX_T   cnt,
     268      510576 :                     void *       scratch ) {
     269      510576 :   if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key;
     270      510240 :   SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
     271      510240 :   return SORT_(private_merge)( key, (long)cnt, tmp );
     272      510576 : }
     273             : 
     274             : static inline SORT_KEY_T *
     275             : SORT_(stable)( SORT_KEY_T * key,
     276             :                SORT_IDX_T   cnt,
     277      510576 :                void *       scratch ) {
     278      510576 :   if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key;
     279      510240 :   SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
     280      510240 :   if( SORT_(private_merge)( key, (long)cnt, tmp )==tmp ) /* 50/50 branch prob */
     281      105270 :     memcpy( key, tmp, sizeof(SORT_KEY_T)*(ulong)cnt );
     282      510240 :   return key;
     283      510576 : }
     284             : 
/* inplace sorts key[0,cnt) in place (unstable) in O(N lg N) typical /
   O(N^2) worst operations.  Thin public wrapper over the dual-pivot
   quick sort kernel below.  Returns key. */
static inline SORT_KEY_T *
SORT_(inplace)( SORT_KEY_T * key,
                SORT_IDX_T   cnt ) {
  return SORT_(private_quick)( key, cnt );
}
     290             : 
/* select partially sorts key[0,cnt) such that key[rnk] holds the key of
   rank rnk (keys before it are not-after it, keys after it are
   not-before it).  Assumes key non-NULL, cnt valid and positive, rnk in
   [0,cnt).  Returns key. */
static inline SORT_KEY_T *
SORT_(select)( SORT_KEY_T * key,
               SORT_IDX_T   cnt,
               SORT_IDX_T   rnk ) {
  return SORT_(private_select)( key, cnt, rnk );
}
     297             : 
     298             : static inline SORT_IDX_T
     299             : SORT_(search_geq)( SORT_KEY_T const * sorted,
     300             :                    SORT_IDX_T         cnt,
     301        6153 :                    SORT_KEY_T         query ) {
     302        6153 :   SORT_IDX_T j = (SORT_IDX_T)0;
     303        6153 :   SORT_IDX_T n = (SORT_IDX_T)cnt;
     304       67653 :   while( n>((SORT_IDX_T)1) ) {
     305             :     /* At this point the j corresponding to k is in [j,j+n) */
     306       61500 :     SORT_IDX_T j_left  = j;          SORT_IDX_T n_left  = n >> 1;
     307       61500 :     SORT_IDX_T j_right = j + n_left; SORT_IDX_T n_right = n - n_left;
     308       61500 :     int  go_left = SORT_BEFORE( query, sorted[ j_right ] );
     309       61500 :     j = go_left ? j_left : j_right; /* branchless */
     310       61500 :     n = go_left ? n_left : n_right; /* branchless */
     311       61500 :   }
     312        6153 :   return j;
     313        6153 : }
     314             : 
     315             : #if SORT_PARALLEL
     316             : 
     317             : SORT_STATIC FD_MAP_REDUCE_PROTO( SORT_(private_merge_para)   );
     318             : SORT_STATIC FD_FOR_ALL_PROTO   ( SORT_(private_memcpy_para)  );
     319             : 
/* stable_fast_para: thread-parallel stable sort; result lands in key or
   scratch (returned pointer says which).  Falls back to the serial
   stable_fast when the problem is too small to amortize thread
   dispatch. */
static inline SORT_KEY_T *
SORT_(stable_fast_para)( fd_tpool_t * tpool,
                         ulong        t0,
                         ulong        t1,
                         SORT_KEY_T * key,
                         SORT_IDX_T   cnt,
                         void *       scratch ) {

  /* The wallclock time of the below for N keys and T threads is
     roughly:

       alpha N + beta N (ln N - ln T) / T + gamma ln T

     where the first term represents the cost of the final sort rounds
     (which are parallelized over increasingly fewer threads than the
     initial rounds ... wallclock sums up to something proportional to N
     in the limit of large T), the second term represents the cost of
     the initial parallel rounds (which are parallelized over T threads)
     and the last term represents the wallclock to dispatch and
     synchronize the threads.  Optimizing T for a given N yields:

       -beta N (ln N - ln T_opt) / T_opt^2 - beta N / T_opt^2 + gamma / T_opt = 0

     Solving for T_opt in the limit N >> T_opt yields:

       T_opt = (beta/gamma) N ln N

     where the ratio beta is roughly the merge pass marginal wallclock
     per key and gamma is proportional to the marginal wallclock to
     start and stop a thread.  Since these are typically used over a
     domain of moderate N, we can approximate ln N by ln N_ref,
     yielding:

       T_opt ~ N / thresh

    where thresh ~ gamma / (beta ln N_ref).  We use an empirical value
    such threads will have at least one normal page of work of keys to
    process in a block typically. */

  if( FD_UNLIKELY( cnt<2L ) ) return key;

  /* thresh: min keys per thread (~one 4 KiB page worth of keys). */
  static ulong const thresh = (4096UL + sizeof(SORT_KEY_T)-1UL) / sizeof(SORT_KEY_T); /* ceil(page_sz/key_sz) */
  ulong t_cnt = fd_ulong_min( t1 - t0, ((ulong)cnt) / thresh );
  if( FD_UNLIKELY( t_cnt<2UL ) ) return SORT_(stable_fast)( key, cnt, scratch ); /* too small to parallelize */
  t1 = t0 + t_cnt; /* shrink the thread range to the useful count */

  /* out[0] receives where the merge reduction left the sorted keys. */
  SORT_KEY_T * out[1];
  FD_MAP_REDUCE( SORT_(private_merge_para), tpool,t0,t1, 0L,(long)cnt, out, key, scratch );
  return out[0];
}
     370             : 
/* stable_para: thread-parallel stable sort that guarantees the sorted
   keys end up back in key (parallel memcpy from scratch if needed).
   Returns key. */
static inline SORT_KEY_T *
SORT_(stable_para)( fd_tpool_t * tpool,
                    ulong        t0,
                    ulong        t1,
                    SORT_KEY_T * key,
                    SORT_IDX_T   cnt,
                    void *       scratch ) {

  /* This works the same as the above but does a parallel memcpy if the
     result doesn't end up in the correct place. */

  if( FD_UNLIKELY( cnt<2L ) ) return key;

  /* thresh: min keys per thread (~one 4 KiB page worth of keys). */
  static ulong const thresh = (4096UL + sizeof(SORT_KEY_T)-1UL) / sizeof(SORT_KEY_T); /* ceil(page_sz/key_sz) */
  ulong t_cnt = fd_ulong_min( t1 - t0, ((ulong)cnt) / thresh );
  if( FD_UNLIKELY( t_cnt<2UL ) ) return SORT_(stable)( key, cnt, scratch ); /* too small to parallelize */
  t1 = t0 + t_cnt;

  SORT_KEY_T * out[1];
  FD_MAP_REDUCE( SORT_(private_merge_para), tpool,t0,t1, 0L,(long)cnt, out, key, scratch );
  if( out[0]!=key ) FD_FOR_ALL( SORT_(private_memcpy_para), tpool,t0,t1, 0L,(long)cnt, key, scratch ); /* 50/50 branch prob */
  return key;
}
     394             : 
     395             : SORT_STATIC void
     396             : SORT_(private_quick_node)( void * _tpool,
     397             :                            ulong  t0,      ulong t1,
     398             :                            void * _args,
     399             :                            void * _reduce, ulong _stride,
     400             :                            ulong  _l0,     ulong _l1,
     401             :                            ulong  _m0,     ulong _m1,
     402             :                            ulong  _n0,     ulong _n1 );
     403             : 
/* inplace_para: thread-parallel in-place (unstable) sort over tpool
   threads (t0,t1).  Returns key. */
static inline SORT_KEY_T *
SORT_(inplace_para)( fd_tpool_t * tpool,
                     ulong        t0,
                     ulong        t1,
                     SORT_KEY_T * key,
                     SORT_IDX_T   cnt ) {
  if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key; /* 0 or 1 key: already sorted */
  /* key / cnt are smuggled through the generic tpool task signature's
     args / reduce void * slots; trailing params are unused here. */
  SORT_(private_quick_node)( tpool,t0,t1, (void *)key, (void *)(ulong)cnt, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL );
  return key;
}
     414             : 
     415             : #if FD_HAS_ALLOCA
     416             : 
     417             : SORT_STATIC SORT_KEY_T *
     418             : SORT_(fast_para)( fd_tpool_t * tpool, ulong t0, ulong t1,
     419             :                   SORT_KEY_T * key,
     420             :                   SORT_IDX_T   cnt,
     421             :                   void *       scratch,
     422             :                   ulong        seed,
     423             :                   int          stable );
     424             : 
     425             : #endif /* FD_HAS_ALLOCA */
     426             : 
     427             : #endif /* SORT_PARALLEL */
     428             : 
     429             : FD_PROTOTYPES_END
     430             : 
     431             : #endif
     432             : 
     433             : #if SORT_IMPL_STYLE!=1 /* need implementations (assumes header already included) */
     434             : 
     435             : SORT_KEY_T *
     436             : SORT_(insert)( SORT_KEY_T * key,
     437     5147580 :                SORT_IDX_T   cnt ) {
     438   118479144 :   for( SORT_IDX_T i=((SORT_IDX_T)1); i<cnt; i++ ) {
     439   113331564 :     SORT_KEY_T key_i = key[i];
     440   113331564 :     SORT_IDX_T j = i;
     441   935723698 :     while( j ) {
     442   931917198 :       SORT_IDX_T k = j - ((SORT_IDX_T)1);
     443   931917198 :       SORT_KEY_T key_j = key[k]; if( !(SORT_BEFORE( key_i, key_j )) ) break;
     444   822392134 :       key[j] = key_j;
     445   822392134 :       j = k;
     446   822392134 :     }
     447   113331564 :     key[j] = key_i;
     448   113331564 :   }
     449     5147580 :   return key;
     450     5147580 : }
     451             : 
/* private_merge_pass merges the sorted runs key_l[0,cnt_l) and
   key_r[0,cnt_r) into key_m (stable: ties taken from the left run).
   Assumes cnt_l>0 and cnt_r>0; key_m must not overlap key_l (it may
   overlap the right end of key_r, which the forward pass tolerates). */
static void
SORT_(private_merge_pass)( SORT_KEY_T const * key_l, long cnt_l,
                           SORT_KEY_T const * key_r, long cnt_r,
                           SORT_KEY_T       * key_m ) {
  long i = 0L;
  long j = 0L;
  long k = 0L;
# if 1 /* Minimal C language operations */
  for(;;) { /* Note that cnt_left>0 and cnt_right>0 as cnt > SORT_MERGE_THRESH >= 1 at this point */
    if( SORT_BEFORE( key_r[j], key_l[i] ) ) {
      key_m[k++] = key_r[j++];
      if( j>=cnt_r ) {
        memcpy( key_m + k, key_l + i, sizeof(SORT_KEY_T)*(ulong)(cnt_l-i) ); /* append left  stragglers (at least one) */
        break;
      }
    } else {
      key_m[k++] = key_l[i++];
      if( i>=cnt_l ) {
        memcpy( key_m + k, key_r + j, sizeof(SORT_KEY_T)*(ulong)(cnt_r-j) ); /* append right stragglers (at least one) */
        break;
      }
    }
  }
# else /* Branch free variant (Pyth investigations suggested the above is better) */
  while( ((i<cnt_l) & (j<cnt_r)) ) {
    SORT_KEY_T ki = key_l[ i ];
    SORT_KEY_T kj = key_r[ j ];
    int use_left = !(SORT_BEFORE( kj, ki ));
    key_m[ k ] = use_left ? ki : kj; /* cmov ideally */
    i += (long) use_left;
    j += (long)!use_left;
    k++;
  }
  if(      i<cnt_l ) memcpy( key_m + k, key_l + i, sizeof(SORT_KEY_T)*(ulong)(cnt_l-i) );
  else if( j<cnt_r ) memcpy( key_m + k, key_r + j, sizeof(SORT_KEY_T)*(ulong)(cnt_r-j) );
# endif
}
     489             : 
/* private_merge recursively merge sorts key[0,cnt) using tmp as a
   same-sized ping-pong buffer.  Returns a pointer to where the sorted
   keys ended up: either key or tmp (callers copy back if they need a
   specific location). */
SORT_KEY_T *
SORT_(private_merge)( SORT_KEY_T * key,
                      long         cnt,
                      SORT_KEY_T * tmp ) {

  /* FIXME: USE IMPLICIT RECURSION ALA QUICK BELOW? */

  /* If below threshold, use insertion sort */

  if( cnt<=((long)(SORT_MERGE_THRESH)) ) return SORT_(insert)( key, (SORT_IDX_T)cnt );

  /* Otherwise, break input in half and sort the halves */

  SORT_KEY_T * key_l = key;
  SORT_KEY_T * tmp_l = tmp;
  long         cnt_l = cnt >> 1;
  SORT_KEY_T * in_l  = SORT_(private_merge)( key_l, cnt_l, tmp_l );

  SORT_KEY_T * key_r = key + cnt_l;
  SORT_KEY_T * tmp_r = tmp + cnt_l;
  long         cnt_r = cnt - cnt_l;
  SORT_KEY_T * in_r  = SORT_(private_merge)( key_r, cnt_r, tmp_r );

  /* Merge in_l / in_r */

  SORT_KEY_T * out = (in_l==key) ? tmp : key; /* If in_l overlaps with key, merge into tmp.  Otherwise, merge into key */

  /* Note that in_l does not overlap with out at this point.  in_r might
     overlap with the right half of out but the merge pass is fine for
     that case. */

  SORT_(private_merge_pass)( in_l, cnt_l, in_r, cnt_r, out );

  return out;
}
     525             : 
/* This uses a dual pivot quick sort for better theoretical and
   practical mojo.

   SORT_(private_quick) sorts key[i] for i in [0,cnt) in place (not
   stable) and returns key.  Recursion is managed explicitly: pending
   partitions are stored on stack as (lo,hi) index pairs.  Each level
   pushes at most two partitions (four SORT_IDX_T) and, because the
   smallest partition is always processed next, the depth is O(log_2
   cnt); sizing the stack as 4 indices per level times the bit width of
   SORT_IDX_T (8*sizeof) levels is thus conservatively sufficient (see
   the scheduling note below). */

SORT_KEY_T *
SORT_(private_quick)( SORT_KEY_T * key,
                      SORT_IDX_T   cnt ) {
  SORT_IDX_T stack[ 4UL*8UL*sizeof(SORT_IDX_T) ]; /* See note below on sizing */
  ulong      stack_cnt = 0UL;

  stack[ stack_cnt++ ] = (SORT_IDX_T)0;
  stack[ stack_cnt++ ] = cnt;
  for(;;) {

    /* Pop the next partition to process */

    if( !stack_cnt ) return key; /* All done */
    SORT_IDX_T h = stack[ --stack_cnt ];
    SORT_IDX_T l = stack[ --stack_cnt ];

    /* [l,h) is the partition we are sorting.  If this partition is
       small enough, sort it via insertion sort. */

    SORT_IDX_T n = h-l;
    if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) {
      SORT_(insert)( key+l, n );
      continue;
    }

    /* This partition is too large to insertion sort.  Pick two pivots,
       sort the partition into a left, center and right partition and
       then recursively sort those partitions.  We initially use a
       simple choice of pivots that is ideal for nearly sorted data (in
       either direction) with a little extra sampling to improve the
       partitioning for randomly ordered data.  There is a possibility
       that this deterministic choice of pivots will not succeed in
       making two or more non-empty partitions; we detect and correct
       that below. */

    /* Initial pivot selection */

    SORT_KEY_T p1;
    SORT_KEY_T p2;
    do {
      SORT_IDX_T n3 = n / (SORT_IDX_T)3; /* magic multiply under the hood typically */
      SORT_IDX_T h1 = h - (SORT_IDX_T)1;
      SORT_KEY_T p0 = key[ l       ];
      /**/       p1 = key[ l  + n3 ];
      /**/       p2 = key[ h1 - n3 ];
      SORT_KEY_T p3 = key[ h1      ];
#     if SORT_QUICK_ORDER_STYLE==0
      /* Generates better code for integral types (branchless via stack) */
      SORT_KEY_T _k[2]; ulong _c;
#     define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
#     else
      /* Generates much better code for floating point types (branchless
         via min / max floating point instructions) */
      SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
#     define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
#     endif

      /* 5 compare-exchange sorting network on the 4 samples; after
         this, p0/p1/p2/p3 are ordered per SORT_BEFORE and the middle
         two samples p1/p2 serve as the pivots (p1 "<=" p2). */
      ORDER(p0,p2); ORDER(p1,p3);
      ORDER(p0,p1); ORDER(p2,p3);
      ORDER(p1,p2);
#     undef ORDER
    } while(0);

  retry: /* Three way partitioning (silly language restriction) */;
    SORT_IDX_T i = l;
    SORT_IDX_T j = l;
    SORT_IDX_T k = h-(SORT_IDX_T)1;
    do { /* Note [j,k] is non-empty (look ma ... no branches!) */

      /* At this point, p1 <= p2 and:
         - keys [l,i) are before p1 (left partition)
         - keys [i,j) are in [p1,p2] (center partition)
         - keys [j,k] are not yet partitioned (region is non-empty)
         - keys (k,h) are after p2 (right partition)
         Decide where key j should be moved. */

      SORT_KEY_T kj = key[j];

      int to_left  = (SORT_BEFORE( kj, p1 ));
      int to_right = (SORT_BEFORE( p2, kj ));
      SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) ); /* destination index for kj (branchless select) */

#     if SORT_QUICK_SWAP_MINIMIZE
      if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; } /* ~2/3 prob */
#     else
      /**/                      key[j] = key[m]; key[m] = kj;
#     endif

      i += (SORT_IDX_T) to_left;
      j += (SORT_IDX_T)(to_right^1); /* to_left_or_to_center <=> !to_right */
      k -= (SORT_IDX_T) to_right;
    } while( j<=k );

    /* Schedule recursion */

    if( FD_UNLIKELY( (j-i)==n ) ) {

      /* All the keys ended up in the center partition.  If p1<p2, we
         had an unlucky choice of pivots such that p1==min(key[i]) and
         p2==max(key[i]) for i in [l,h).  Since we picked the keys
         deterministically above, we would have the same bad luck the
         next time we processed this partition.  But, because there are
         at least two distinct elements in the partition, there is a
         choice of pivots that will make a non-trivial partition, so we
         need to redo this partition with different pivots.

         Randomly picking a new pivot pair in [l,h) has probability
         between ~50% and ~100%.  The worst case is half the keys in
         the partition equal to p1 and the other half equal to p2; the
         best case exactly one key is equal to p1 / p2 and all other
         keys are in (p1,p2] / [p1,p2).

         The optimal handling of the worst case though is just to set
         p1=p2 (p2=p1).  This will pull the keys equal to p1 (p2) into
         the left (right) partition and the remaining keys into the
         center partition; the right (left) partition will be empty.
         Thus, this is guaranteed to yield a non-trivial partition of
         the center partition but may not yield as load balanced set of
         partitions as random for the next iteration.  We go with the
         deterministic option for simplicity below. */

      if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }

      /* p1==p2 here ... all keys in this partition are the same
         We don't need to recurse further in this partition. */

    } else {

      /* Between [1,n-1] keys ended up in the center partition such that
         at least 1 key ended up in another partition.  We push all
         three partitions onto the partition stack for recursion.  The
         order of the pushes is important to bound the maximum stack
         usage.  Below, we push the center partition followed by the
         larger of the left and right partitions then followed by the
         smaller of the left and right partitions (such that we process
         the smallest next iteration).  For this, the same O(log_2 cnt)
         max recursion depth applies as quicksort even this does a three
         way partitioning.  That is, let fl/fc/fr be the fraction of
         keys in the left, center, right partitions.  At this point, fl
         is in [0,1), fc is in (0,1) and fr is in [0,1) and fl+fc+fr=1.
         The smaller of the left or right partition is what process next
         and, as the smaller of the left or right, it fraction of the
         keys is at most 0.5(1-fc)<=0.5.  We do push up to two
         partitions onto the stack (four SORT_IDX_T) each level of
         recursion though instead of one like normal quick sort though. */

      int left_larger = (i-l) > (h-j); /* True if the left partition should go on the stack */
      SORT_IDX_T l_larger  = SORT_IDX_IF( left_larger, l, j );
      SORT_IDX_T h_larger  = SORT_IDX_IF( left_larger, i, h );
      SORT_IDX_T l_smaller = SORT_IDX_IF( left_larger, j, l );
      SORT_IDX_T h_smaller = SORT_IDX_IF( left_larger, h, i );

      /* Immediately process empty partitions */
      ulong push_smaller = (ulong)(l_smaller<h_smaller); FD_COMPILER_FORGET( push_smaller );

      /* Immediately process partitions where all keys are the same */
      ulong push_center  = (ulong)(SORT_BEFORE( p1, p2 )); FD_COMPILER_FORGET( push_center );

      /* Guaranteed non-empty (larger of left or right have at least 1 key) */
      ulong push_larger  = 1UL;

      /* Branchless conditional pushes: the stores always execute but
         stack_cnt only advances when the corresponding push flag is 1
         (the FD_COMPILER_FORGETs above discourage the compiler from
         reintroducing branches). */
      stack[ stack_cnt ] = l_larger;  stack_cnt += push_larger;
      stack[ stack_cnt ] = h_larger;  stack_cnt += push_larger;
      stack[ stack_cnt ] = i;         stack_cnt += push_center;
      stack[ stack_cnt ] = j;         stack_cnt += push_center;
      stack[ stack_cnt ] = l_smaller; stack_cnt += push_smaller;
      stack[ stack_cnt ] = h_smaller; stack_cnt += push_smaller;
    }
  }
  /* never get here */
}
     700             : 
/* This works identical to sort_private_quick.  Only differences
   relevant to selection are commented (in short, we only recurse on the
   partition that could contain rank).  See above for comments on the
   algo.

   SORT_(private_select) is a quickselect: it rearranges key[i] for i in
   [0,cnt) in place such that key[rnk] holds the element of rank rnk
   (the element that would land at index rnk in a full sort) and returns
   key.  Assumes rnk is in [0,cnt).  Keys outside the final partition
   are left only partially ordered by the partitioning passes. */

SORT_KEY_T *
SORT_(private_select)( SORT_KEY_T * key,
                       SORT_IDX_T   cnt,
                       SORT_IDX_T   rnk ) {
  SORT_IDX_T l = (SORT_IDX_T)0;
  SORT_IDX_T h = cnt;
  for(;;) {

    /* The partition [l,h) contains rnk.  If this partition is small
       enough, sort it via insertion sort.  FIXME: probably could
       truncate the insertion sort early for further optimization (e.g.
       if rnk==l / rnk==h-1, we only need to find the min/max for O(n)
       operation). */

    SORT_IDX_T n = h-l;
    if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) { SORT_(insert)( key+l, n ); return key; }

    /* Pivot selection (same sampled sorting network as private_quick) */

    SORT_KEY_T p1;
    SORT_KEY_T p2;
    do {
      SORT_IDX_T n3 = n / (SORT_IDX_T)3;
      SORT_IDX_T h1 = h - (SORT_IDX_T)1;
      SORT_KEY_T p0 = key[ l       ];
      /**/       p1 = key[ l  + n3 ];
      /**/       p2 = key[ h1 - n3 ];
      SORT_KEY_T p3 = key[ h1      ];
#     if SORT_QUICK_ORDER_STYLE==0
      SORT_KEY_T _k[2]; ulong _c;
#     define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
#     else
      SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
#     define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
#     endif

      ORDER(p0,p2); ORDER(p1,p3);
      ORDER(p0,p1); ORDER(p2,p3);
      ORDER(p1,p2);
#     undef ORDER
    } while(0);

  retry: /* Silly language restriction */;
    SORT_IDX_T i = l;
    SORT_IDX_T j = l;
    SORT_IDX_T k = h-(SORT_IDX_T)1;
    do {

      /* Three way partition step; see private_quick for the invariants */

      SORT_KEY_T kj = key[j];

      int to_left  = (SORT_BEFORE( kj, p1 ));
      int to_right = (SORT_BEFORE( p2, kj ));
      SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );

#     if SORT_QUICK_SWAP_MINIMIZE
      if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; }
#     else
      /**/                      key[j] = key[m]; key[m] = kj;
#     endif

      i += (SORT_IDX_T) to_left;
      j += (SORT_IDX_T)(to_right^1);
      k -= (SORT_IDX_T) to_right;
    } while( j<=k );

    if( FD_UNLIKELY( (j-i)==n ) ) {

      /* All keys landed in the center partition; retry with p1=p2 if
         the pivots were distinct (see private_quick for rationale) */

      if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }

      /* p1==p2 here ... all keys in this partition are the same.  We
         don't need to recurse further in this partition as all keys
         would do here. */

      return key;
    }

    /* At this point:
       - [l,i) is the left partition,
       - [i,j) is the center partition
       - [j,h) is the right partition
       Between [1,n-1] keys ended up in the center partition such that
       at least 1 key ended up in another partition.  Recurse on the
       partition that contains rnk.  As this is typically used in median
       finding, we assume that the center partition is the most likely
       in the below selection (this can also be done branchlessly). */

    SORT_IDX_T l_next = i; SORT_IDX_T h_next = j;
    if( FD_UNLIKELY( rnk< i   ) ) l_next = l, h_next = i;
    if( FD_UNLIKELY( j  <=rnk ) ) l_next = j, h_next = h;
    l = l_next; h = h_next;
  }
  /* never get here */
}
     797             : 
     798             : #if SORT_PARALLEL
     799             : 
/* This works identical to sort_private_quick.  Only differences
   relevant to parallelization are commented.  See above for comments on
   the algo.

   SORT_(private_quick_node) is a tpool task: it sorts the cnt keys at
   _args in place, dispatching large sub-partitions to idle threads in
   [t0,t1) via fd_tpool_exec and sorting the remainder locally on the
   explicit partition stack.  cnt is smuggled through the _reduce
   argument as a ulong.  The unused tpool task arguments are voided
   below to match the fd_tpool task signature. */

void
SORT_(private_quick_node)( void * _tpool,
                           ulong  t0,      ulong t1,
                           void * _args,
                           void * _reduce, ulong _stride,
                           ulong  _l0,     ulong _l1,
                           ulong  _m0,     ulong _m1,
                           ulong  _n0,     ulong _n1 ) {
  (void)_stride; (void)_l0; (void)_l1; (void)_m0; (void)_m1; (void)_n0; (void)_n1;

  fd_tpool_t * tpool = (fd_tpool_t *)     _tpool;
  SORT_KEY_T * key   = (SORT_KEY_T *)     _args;
  SORT_IDX_T   cnt   = (SORT_IDX_T)(ulong)_reduce;

  SORT_IDX_T stack[ 4UL*8UL*sizeof(SORT_IDX_T) ]; /* sized as in private_quick */
  ulong      stack_cnt = 0UL;

  ulong wait_stack[ 82 ]; /* See note below for sizing considerations */
  ulong wait_stack_cnt = 0UL;

  stack[ stack_cnt++ ] = (SORT_IDX_T)0;
  stack[ stack_cnt++ ] = cnt;

  while( stack_cnt ) {

    SORT_IDX_T h = stack[ --stack_cnt ];
    SORT_IDX_T l = stack[ --stack_cnt ];

    SORT_IDX_T n = h-l;
    if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) {
      SORT_(insert)( key+l, n );
      continue;
    }

    /* Pivot selection (same sampled sorting network as private_quick) */

    SORT_KEY_T p1;
    SORT_KEY_T p2;
    do {
      SORT_IDX_T n3 = n / (SORT_IDX_T)3;
      SORT_IDX_T h1 = h - (SORT_IDX_T)1;
      SORT_KEY_T p0 = key[ l       ];
      /**/       p1 = key[ l  + n3 ];
      /**/       p2 = key[ h1 - n3 ];
      SORT_KEY_T p3 = key[ h1      ];
#     if SORT_QUICK_ORDER_STYLE==0
      SORT_KEY_T _k[2]; ulong _c;
#     define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
#     else
      SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
#     define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
#     endif

      ORDER(p0,p2); ORDER(p1,p3);
      ORDER(p0,p1); ORDER(p2,p3);
      ORDER(p1,p2);
#     undef ORDER
    } while(0);

  retry: /* Silly language restriction */;
    SORT_IDX_T i = l;
    SORT_IDX_T j = l;
    SORT_IDX_T k = h-(SORT_IDX_T)1;
    do {
      SORT_KEY_T kj = key[j];

      int to_left  = (SORT_BEFORE( kj, p1 ));
      int to_right = (SORT_BEFORE( p2, kj ));
      SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );

#     if SORT_QUICK_SWAP_MINIMIZE
      if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; }
#     else
      /**/                      key[j] = key[m]; key[m] = kj;
#     endif

      i += (SORT_IDX_T) to_left;
      j += (SORT_IDX_T)(to_right^1);
      k -= (SORT_IDX_T) to_right;
    } while( j<=k );

    if( FD_UNLIKELY( (j-i)==n ) ) {

      if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }

      /* p1==p2 here ... all keys in this partition are identical; done */

    } else {

      /* At this point, we have at most 3 partitions to sort and can use
         the caller and tpool threads (t0,t1) to sort them.  To load
         balance this, we sort the partitions by an estimate of how much
         work each requires.  We estimate this as proportional to:

           work = cnt ceil log_3 cnt if cnt>0 and 0 otherwise

         where cnt is the number of keys in the partition.  Since this
         is a rough estimate used for load balancing, we don't need a
         precise log_3 calculation.  With log_3 x ~ 0.633 lg x, we
         approximate:

           ceil log_3 x ~ ceil( 5 (lg x) / 8 )
                        ~ ceil( 5 (floor lg x) / 8)
                        = floor( (5 (floor lg x) + 7)/8 )
                        = (5 (msb x) + 7) >> 3

         This approximation is trivial to compute, is monotonic and is
         either exact or 1 less for x in (0,2^64).  The approximation is
         such that cnt==1 / cnt>1 yields zero / positive.  As such,
         work==0 strictly indicates no additional work is necessary. */

#     define WORK_APPROX(cnt) (((ulong)(cnt))*(ulong)((5*fd_ulong_find_msb_w_default( (ulong)(cnt), 0 )+7) >> 3))

      struct { SORT_IDX_T l; SORT_IDX_T h; ulong work; } part[3];

      part[0].l = l; part[0].h = i; part[0].work =                                     WORK_APPROX( i-l );
      part[1].l = i; part[1].h = j; part[1].work = fd_ulong_if( SORT_BEFORE( p1, p2 ), WORK_APPROX( j-i ), 0UL );
      part[2].l = j; part[2].h = h; part[2].work =                                     WORK_APPROX( h-j );

#     undef WORK_APPROX

      ulong work_remain = part[0].work + part[1].work + part[2].work;

      /* Sort the 3 partitions by decreasing estimated work */

#     define ORDER(i,j) fd_swap_if( part[j].work > part[i].work, part[i], part[j] )

      ORDER(0,2); ORDER(0,1); ORDER(1,2);

#     undef ORDER

      for( ulong idx=0UL; idx<3UL; idx++ ) {

        /* At this point, we need to schedule partitions [idx,2] requiring
           work_remain total estimated work and we have the caller and
           tpool threads (t0,t1) to do this work.  Compute which threads
           should execute partition idx. */

        ulong p_work = part[idx].work;
        if( FD_UNLIKELY( !p_work ) ) break; /* no work needed for this partition and remaining partitions */

        SORT_IDX_T p_l = part[idx].l;
        SORT_IDX_T p_h = part[idx].h;

        ulong t_cnt = t1 - t0;
        if( FD_UNLIKELY( (t_cnt>1UL) & (p_work<work_remain) ) ) {

          /* At this point, we have at least two partitions that need
             work to sort and at least two threads remaining for
             sorting.  Schedule this partition for execution on between
             1 and t_cnt-1 of the remaining threads roughly proportional
             to the estimated work to sort this partition versus the
             total estimated work remaining.

             For the wait stack, in the worst case, we schedule two
             partitions for remote execution covering two thirds of the
             keys each round.  This implies the wait stack depth is
             bounded by:

               2 ceil log_3 thread_cnt

             Practically, thread_cnt<=FD_TILE_MAX.  For
             FD_TILE_MAX==1024, 14 is sufficient and 82 is sufficient
             for any thread_cnt representable by a ulong. */

          ulong ts = t1 - fd_ulong_min( fd_ulong_max( (t_cnt*p_work + (work_remain>>1)) / work_remain, 1UL ), t_cnt-1UL );

          fd_tpool_exec( tpool, ts, SORT_(private_quick_node), tpool, ts, t1, key + p_l, (void *)(ulong)(p_h - p_l),
                         0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL );

          wait_stack[ wait_stack_cnt++ ] = ts;

          /* Advance t1 and work_remain for the next iteration */

          t1           = ts;
          work_remain -= p_work;

        } else {

          /* At this point, we have only one thread available and/or
             this is the only partition remaining that needs sorting.
             Schedule this partition for local execution. */

          stack[ stack_cnt++ ] = p_l;
          stack[ stack_cnt++ ] = p_h;

        }
      }
    }
  }

  /* Wait for any children to finish */

  while( wait_stack_cnt ) fd_tpool_wait( tpool, wait_stack[ --wait_stack_cnt ] );
}
     993             : 
/* SORT_(private_merge_para): parallel stable merge.  The map phase has
   each thread sort its assigned block of key (using the matching region
   of tmp as scratch) via SORT_(stable_fast) and record where the sorted
   result landed (key or tmp, as stable_fast may return either) in the
   per-block out pointer.  The reduce phase merges adjacent sorted
   blocks pairwise and propagates the merged result's location upward.
   NOTE(review): block_i0 / block_is / block_i1 and _r1 are supplied by
   the FD_MAP_REDUCE framework (block range, split point and right-hand
   reduction result respectively) — confirm against the framework's
   definition. */

FD_MAP_REDUCE_BEGIN( SORT_(private_merge_para), 1L, 0UL, sizeof(SORT_KEY_T *), 1L ) {

  SORT_KEY_T ** out = (SORT_KEY_T **)arg[0];
  SORT_KEY_T *  key = (SORT_KEY_T * )arg[1];
  SORT_KEY_T *  tmp = (SORT_KEY_T * )arg[2];

  /* Sort this thread's block [block_i0,block_i0+block_cnt) and record
     where the sorted keys ended up */

  *out = SORT_(stable_fast)( key + block_i0, (SORT_IDX_T)block_cnt, tmp + block_i0 );

} FD_MAP_END {

  SORT_KEY_T *  key   = (SORT_KEY_T * )arg[1];
  SORT_KEY_T *  tmp   = (SORT_KEY_T * )arg[2];
  SORT_KEY_T ** _in_l = (SORT_KEY_T **)arg[0]; SORT_KEY_T * in_l = *_in_l; long cnt_l = block_is - block_i0;
  SORT_KEY_T ** _in_r = (SORT_KEY_T **)_r1;    SORT_KEY_T * in_r = *_in_r; long cnt_r = block_i1 - block_is;

  /* Merge in_l / in_r (see private_merge above for details) */

  SORT_KEY_T * out = ((in_l==(key+block_i0)) ? tmp : key) + block_i0; /* cmov */

  SORT_(private_merge_pass)( in_l, cnt_l, in_r, cnt_r, out );

  /* Report where the merged result landed to the next reduce level */

  *_in_l = out;

} FD_REDUCE_END
    1018             : 
    1019           0 : FD_FOR_ALL_BEGIN( SORT_(private_memcpy_para), 1L ) {
    1020             : 
    1021           0 :   SORT_KEY_T       * dst = (SORT_KEY_T       *)arg[0];
    1022           0 :   SORT_KEY_T const * src = (SORT_KEY_T const *)arg[1];
    1023             : 
    1024           0 :   if( FD_LIKELY( block_cnt ) ) memcpy( dst + block_i0, src + block_i0, sizeof(SORT_KEY_T)*(ulong)block_cnt );
    1025             : 
    1026           0 : } FD_FOR_ALL_END
    1027             : 
    1028             : #if FD_HAS_ALLOCA
    1029             : 
/* SORT_(private_cntcpy_para): count-and-copy phase of the parallel
   partitioned sort.  Each thread scans its block of key, determines
   (via branchless binary search over the pivot array) which thread each
   key should be routed to for subsorting, tallies per-destination
   counts into a local scratch, stages the keys into tmp, and finally
   publishes its count vector back to the main thread's _key_cnt table.
   NOTE(review): tpool_t0 appears to be the calling thread's index as
   supplied by the FD_FOR_ALL framework — confirm against the
   framework's definition. */

static FD_FOR_ALL_BEGIN( SORT_(private_cntcpy_para), 1L ) {
  ulong              tpool_base = (ulong             )arg[0];
  ulong              t_cnt      = (ulong             )arg[1];
  ulong            * _key_cnt   = (ulong            *)arg[2];
  SORT_KEY_T const * key        = (SORT_KEY_T const *)arg[3];
  SORT_KEY_T       * tmp        = (SORT_KEY_T       *)arg[4];
  SORT_KEY_T const * pivot      = (SORT_KEY_T const *)arg[5];

  /* Note keys in [ pivot[t-1], pivot[t] ) for t in [0,t_cnt) are keys
     assigned to thread t for sorting.  pivot[-1] / pivot[t_cnt-1] are
     an implied -infinity / +infinity and never explicitly accessed.  As
     such, pivot is indexed [0,t_cnt-1). */

  /* Allocate and clear a local scratch for counting.  We don't count
     directly into key_cnt to avoid false sharing while counting. */

  ulong * key_cnt = fd_alloca( alignof(ulong), sizeof(ulong)*t_cnt );

  memset( key_cnt, 0, sizeof(ulong)*t_cnt );

  for( long i=block_i0; i<block_i1; i++ ) {
    SORT_KEY_T ki = key[i];

    /* Determine which thread is responsible for subsorting this key */
    /* FIXME: ideally, use a unrolled fixed count iteration for here */

    ulong l = 0UL;
    ulong h = t_cnt;
    for(;;) {
      ulong n = h - l;
      if( n<2UL ) break; /* range narrowed to a single thread */

      /* At this point, the thread for key i is in [l,h) and this range
         contains at least two threads.  Split this range in half. */

      ulong m = l + (n>>1); /* In [1,t_cnt) */
      int   c = SORT_BEFORE( ki, pivot[m-1UL] );

      /* If ki is before pivot[m], the target thread is in [l,m).
         Otherwise, the target thread is in [m,h).  (Branchless narrow
         via fd_ulong_if.) */

      l = fd_ulong_if( c, l, m );
      h = fd_ulong_if( c, m, h );
    }

    /* At this point, key[i] should be assigned to thread l.  Count it
       and copy it into tmp to prepare to scatter. */

    key_cnt[l]++;
    tmp[i] = ki;
  }

  /* Send the counts back to main thread for partitioning */

  memcpy( _key_cnt + (tpool_t0-tpool_base)*t_cnt, key_cnt, sizeof(ulong)*t_cnt );

} FD_FOR_ALL_END
    1087             : 
/* private_scatter_para: thread-parallel scatter pass of the sample
   sort.  Each thread re-reads its block of keys from tmp, recomputes
   (via the same pivot binary search as the counting pass) which thread
   each key is assigned to, and writes the key to its final pre-subsort
   slot in key[] using this thread's private partition cursors.

   arg[0]  tpool_base  first tpool thread index participating
   arg[1]  t_cnt       number of participating threads
   arg[2]  _part       t_cnt x t_cnt partition offsets from main thread
   arg[3]  key         output key array (scatter destination)
   arg[4]  tmp         input keys copied by the counting pass
   arg[5]  pivot       t_cnt-1 sorted pivots partitioning key space

   block_i0 / block_i1 / tpool_t0 are provided by FD_FOR_ALL. */

static FD_FOR_ALL_BEGIN( SORT_(private_scatter_para), 1L ) {
  ulong              tpool_base = (ulong             )arg[0];
  ulong              t_cnt      = (ulong             )arg[1];
  ulong      const * _part      = (ulong      const *)arg[2];
  SORT_KEY_T       * key        = (SORT_KEY_T       *)arg[3];
  SORT_KEY_T const * tmp        = (SORT_KEY_T const *)arg[4];
  SORT_KEY_T const * pivot      = (SORT_KEY_T const *)arg[5];

  /* Receive the key array partitioning from the main thread.  Like the
     above, we don't operate on the part array directly to avoid false
     sharing while scattering. */

  ulong * part = fd_alloca( alignof(ulong), sizeof(ulong)*t_cnt );

  memcpy( part, _part + (tpool_t0-tpool_base)*t_cnt, sizeof(ulong)*t_cnt );

  for( long i=block_i0; i<block_i1; i++ ) {
    SORT_KEY_T ki = tmp[i];

    /* Determine which thread is responsible for subsorting this key
       (again).  Identical considerations to the above.  Note: we can
       eliminate this computation if willing to burn O(N) additional
       scratch memory by saving results computed above for use here. */

    /* Branchless binary search: narrow [l,h) until it contains exactly
       one thread.  pivot[-1] / pivot[t_cnt-1] are implied -/+infinity
       and never dereferenced (m is always in [1,t_cnt)). */

    ulong l = 0UL;
    ulong h = t_cnt;
    for(;;) {
      ulong n = h - l;
      if( n<2UL ) break;
      ulong m = l + (n>>1);
      int   c = SORT_BEFORE( ki, pivot[m-1UL] );
      l = fd_ulong_if( c, l, m );
      h = fd_ulong_if( c, m, h );
    }

    /* Send this key to target thread (part[l] is this thread's private
       write cursor into thread l's subsort range, advanced per key) */

    key[ part[l]++ ] = ki;
  }

} FD_FOR_ALL_END
    1129             : 
    1130           0 : static FD_FOR_ALL_BEGIN( SORT_(private_subsort_para), 1L ) {
    1131           0 :   ulong const * part   = (ulong const *)arg[0];
    1132           0 :   SORT_KEY_T  * key    = (SORT_KEY_T  *)arg[1];
    1133           0 :   SORT_KEY_T  * tmp    = (SORT_KEY_T  *)arg[2];
    1134           0 :   int           stable = (int          )arg[3];
    1135             : 
    1136           0 :   ulong j0 = part[ block_i0 ];
    1137           0 :   ulong j1 = part[ block_i1 ];
    1138             : 
    1139           0 :   if( stable ) SORT_(stable) ( key + j0, j1 - j0, tmp + j0 );
    1140           0 :   else         SORT_(inplace)( key + j0, j1 - j0 );
    1141             : 
    1142           0 : } FD_FOR_ALL_END
    1143             : 
    1144             : SORT_KEY_T *
    1145             : SORT_(fast_para)( fd_tpool_t * tpool, ulong t0, ulong t1,
    1146             :                   SORT_KEY_T * key,
    1147             :                   SORT_IDX_T   cnt,
    1148             :                   void *       scratch,
    1149             :                   ulong        seed,
    1150           0 :                   int          stable ) {
    1151             : 
    1152           0 :   if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key; /* nothing to do */
    1153             : 
    1154             :   /* For the below (if sampling ops are fully threaded), sampling costs
    1155             :      O(S), the sample sort costs O(S lg(T S)), the downsampling costs
    1156             :      O(1), the partitioning costs O(T), the counting and scattering cost
    1157             :      O(N (lg T) / T), the subsorts cost (N/T) lg(N/T) and thread
    1158             :      synchronization overhead is O(lg T).
    1159             : 
    1160             :      Combining all these, assuming the sampling ratio S is a fixed O(1)
    1161             :      quantity and assuming N (lg T) / T term in the counting/scattering
    1162             :      approximately cancals the -N lg T / T in the subsorts yields a
    1163             :      wallclock that scales as:
    1164             : 
    1165             :        alpha N (ln N) / T + beta ln T + gamma T
    1166             : 
    1167             :      The first term represents the parallelized sorting cost, the second
    1168             :      term represents the thread dispatch / sync cost, and the last term
    1169             :      represents the partitioning costs.  Minimizing cost with respect
    1170             :      to T yields:
    1171             : 
    1172             :        -alpha (N ln N) / T_opt^2 + beta / T_opt + gamma = 0
    1173             : 
    1174             :      whose (positive) solution is:
    1175             : 
    1176             :        T_opt ~ (0.5 beta / gamma) [ sqrt( 1 + (4 alpha gamma N ln N)/beta^2) ) - 1 ]
    1177             : 
    1178             :      In the limit 4 alpha gamma N ln N / beta^2 >> 1 (that is,
    1179             :      partitioning costs are much lower than thread start/stop costs), we
    1180             :      have:
    1181             : 
    1182             :        T_opt -> (alpha N ln N) / beta
    1183             : 
    1184             :      In the other limit, we have:
    1185             : 
    1186             :        T_opt -> sqrt( alpha N ln N / gamma )
    1187             : 
    1188             :      The first limit tends to apply in practice.  Thus we use:
    1189             : 
    1190             :        T_opt ~ (N lg N) / thresh
    1191             :              ~ floor( (N msb N + N/2 + thresh/2) / thresh)
    1192             : 
    1193             :      where thresh is an empirical minimum amount of sorting work to
    1194             :      justify starting / stopping a thread.  (As written below, the gamma
    1195             :      term is more like gamma T^2, which makes the other limit more like
    1196             :      the cube root of N ln N but doesn't change the the overall
    1197             :      conclusion.) */
    1198             : 
    1199           0 :   ulong thresh = (4096UL + sizeof(SORT_KEY_T)-1UL) / sizeof(SORT_KEY_T);
    1200           0 :   ulong t_cnt  = fd_ulong_min( t1 - t0, (cnt*(ulong)fd_ulong_find_msb( cnt ) + ((cnt+thresh)>>1)) / thresh );
    1201           0 :   if( FD_UNLIKELY( t_cnt<2UL ) ) return stable ? SORT_(stable)( key, cnt, scratch ) : SORT_(inplace)( key, cnt );
    1202           0 :   t1 = t0 + t_cnt;
    1203             : 
    1204             :   /* At this point, we have at least 2 threads available and at least 2
    1205             :      items to sort.  Sample the keys to get some idea of their
    1206             :      distribution, sort the samples and  downsample the sorted samples
    1207             :      into pivots that approximately uniformly partition the samples into
    1208             :      t_cnt groups (and thus approximately partition the keys uniformly
    1209             :      too).  Notes:
    1210             : 
    1211             :      - The sampling below is practically uniform IID but it could be
    1212             :        made more robust with a rejection method (ala fd_rng_ulong_roll).
    1213             : 
    1214             :      - Increasing SORT_OVERSAMPLE_RATIO improves the uniformity of the
    1215             :        partitioning of key space over the inputs but requires more stack
    1216             :        scratch memory and more overhead.
    1217             : 
    1218             :      - All three steps here could be parallelized but it is probably not
    1219             :        worth it given the overhead for threaded versus the number of
    1220             :        samples.
    1221             : 
    1222             :      - For a stable sort, the result is completely deterministic even
    1223             :        if the seed provided is non-deterministic. */
    1224             : 
    1225           0 :   ulong        sample_cnt = t_cnt*(ulong)SORT_OVERSAMPLE_RATIO;
    1226           0 :   SORT_KEY_T * pivot      = fd_alloca( alignof(ulong), sizeof(ulong)*sample_cnt );
    1227             : 
    1228           0 :   for( ulong i=0UL; i<sample_cnt; i++ ) pivot[i] = key[ fd_ulong_hash( seed ^ i ) % cnt ];
    1229             : 
    1230           0 :   SORT_(inplace)( pivot, sample_cnt );
    1231             : 
    1232           0 :   for( ulong i=1UL; i<t_cnt; i++ ) pivot[i-1UL] = pivot[ i*(ulong)SORT_OVERSAMPLE_RATIO ];
    1233             : 
    1234             :   /* At this point, keys in [ pivot[t-1], pivot[t] ) should be assigned
    1235             :      to thread t for thread t's subsort.  pivot[-1] / pivot[t_cnt-1] are
    1236             :      an implicit -infinity / +infinity.  Split the input array equally
    1237             :      over threads and have each thread copy their block of keys into tmp
    1238             :      and count the number of keys in its block that could be assigned to
    1239             :      each thread. */
    1240             : 
    1241           0 : # define part(i,j) part[ (i) + t_cnt*(j) ]
    1242             : 
    1243           0 :   ulong * part = fd_alloca( alignof(ulong), sizeof(ulong)*t_cnt*t_cnt );
    1244             : 
    1245           0 :   SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
    1246             : 
    1247           0 :   FD_FOR_ALL( SORT_(private_cntcpy_para), tpool,t0,t1, 0L,(long)cnt, t0, t_cnt, part, key, tmp, pivot );
    1248             : 
    1249             :   /* At this point, part(i,j) is the count of the number of keys in thread
    1250             :      j's block that were assigned to thread i's subsort.  Convert this
    1251             :      into a partitioning.  In-principle this can be parallelized but
    1252             :      this is rarely worth it practically. */
    1253             : 
    1254           0 :   ulong k = 0UL;
    1255           0 :   for( ulong i=0UL; i<t_cnt; i++ ) {
    1256           0 :     for( ulong j=0UL; j<t_cnt; j++ ) {
    1257           0 :       ulong c_ij = part(i,j);
    1258           0 :       part(i,j) = k;
    1259           0 :       k += c_ij;
    1260           0 :     }
    1261           0 :   }
    1262             : 
    1263             :   /* At this point, the range [ part(i,j), part(i+1,j) ) is where keys
    1264             :      in thread j's block assigned to thread i's subsort should be
    1265             :      scattered.  part(t_cnt,t_cnt-1) is an implied cnt.  Scatter the
    1266             :      keys from tmp back into key in subsorted order. */
    1267             : 
    1268           0 :   FD_FOR_ALL( SORT_(private_scatter_para), tpool,t0,t1, 0L,(long)cnt, t0, t_cnt, part, key, tmp, pivot );
    1269             : 
    1270             :   /* At this point, keys [ part(i,0), part(i+1,0) ) are the keys assigned
    1271             :      to thread i for subsorting.  part(t_cnt,0) is an implied cnt.
    1272             :      Since t_cnt>1 though this location exists.  We make that explicit
    1273             :      such that part[i],part[i+1] give the sets of keys each thread
    1274             :      should sort.  Do the thread parallel subsorts to get the keys into
    1275             :      final sorted order. */
    1276             : 
    1277           0 :   part[ t_cnt ] = cnt;
    1278             : 
    1279           0 :   FD_FOR_ALL( SORT_(private_subsort_para), tpool,t0,t1, 0L,(long)t_cnt, part, key, tmp, stable );
    1280             : 
    1281           0 : # undef part
    1282             : 
    1283           0 :   return key;
    1284           0 : }
    1285             : 
    1286             : #endif /* FD_HAS_ALLOCA */
    1287             : #endif /* SORT_PARALLEL */
    1288             : #endif /* SORT_IMPL_STYLE!=1 */
    1289             : 
    1290             : #undef SORT_
    1291             : #undef SORT_STATIC
    1292             : 
    1293             : #undef SORT_IMPL_STYLE
    1294             : #undef SORT_IDX_IF
    1295             : #undef SORT_OVERSAMPLE_RATIO
    1296             : #undef SORT_PARALLEL
    1297             : #undef SORT_QUICK_SWAP_MINIMIZE
    1298             : #undef SORT_QUICK_ORDER_STYLE
    1299             : #undef SORT_QUICK_THRESH
    1300             : #undef SORT_MERGE_THRESH
    1301             : #undef SORT_BEFORE
    1302             : #undef SORT_IDX_T
    1303             : #undef SORT_KEY_T
    1304             : #undef SORT_NAME

Generated by: LCOV version 1.14