LCOV - code coverage report
Current view: top level - util/tmpl - fd_sort.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 197 426 46.2 %
Date: 2025-12-13 04:39:40 Functions: 67 9038 0.7 %

          Line data    Source code
       1             : /* Declares a family of functions useful for single threaded sorting of
       2             :    POD types in high performance contexts.  Example usage:
       3             : 
       4             :      #define SORT_NAME        sort_double_descend
       5             :      #define SORT_KEY_T       double
       6             :      #define SORT_BEFORE(a,b) (a)>(b)
       7             :      #include "util/tmpl/fd_sort.c"
       8             : 
       9             :    will create the following API for use in the local compile unit:
      10             : 
      11             :      // Sort key[i] for i in [0,cnt) stable in place in a best / average
      12             :      // worst case of O(N) / O(N^2) / O(N^2) operations.
      13             : 
      14             :      double *
      15             :      sort_double_descend_insert( double * key,
      16             :                                  ulong    cnt );
      17             : 
      18             :      // Return 1 if cnt is a sensible value for the sorting APIs
      19             :      // (i.e. cnt*sizeof(double)+alignof(double)-1 will not overflow,
      20             :      // 3*cnt*ceil(log_3(cnt)) will not overflow, etc).
      21             : 
      22             :      int sort_double_descend_cnt_valid( ulong cnt );
      23             : 
      24             :      // Return the alignment and footprint required for a scratch region
      25             :      // adequate for sorting up to cnt elements.  The results will be
      26             :      // standard allocator and standard declaration friendly.  (E.g. a
      27             :      // declaration "double scratch[ cnt ];" will be fine as a scratch
      28             :      // region.)
      29             : 
      30             :      ulong sort_double_descend_stable_scratch_align    ( void      );
      31             :      ulong sort_double_descend_stable_scratch_footprint( ulong cnt );
      32             : 
      33             :      // Sort key[i] for i in [0,cnt) stable in best / average / worst
      34             :      // case of O(N lg N) / O(N lg N) / O(N lg N) operations.  Scratch
      35             :      // is a scratch workspace of suitable alignment and footprint.
      36             :      // Returns where the sorted values ended up.  Will be at either key
      37             :      // or (double *)scratch.
      38             : 
      39             :      double *
      40             :      sort_double_descend_stable_fast( double * key,
      41             :                                       ulong    cnt,
      42             :                                       void   * scratch );
      43             : 
      44             :      // Same as above but does additional copying if necessary such that
      45             :      // the final result ends up in key.  Returns key.
      46             : 
      47             :      double *
      48             :      sort_double_descend_stable( double * key,
      49             :                                  ulong    cnt,
      50             :                                  void *   scratch );
      51             : 
      52             :      // Sort key[i] for i in [0,cnt) inplace in best / average / worst
      53             :      // case of O(N lg N) / O(N lg N) / O(N^2) operations.  Returns key.
      54             : 
      55             :      double *
      56             :      sort_double_descend_inplace( double * key,
      57             :                                   ulong    cnt );
      58             : 
      59             :      // Partially sort key[i] such that key[rnk] holds rnk and return
      60             :      // key[rnk] in best / average / worst of O(N) / O(N) / O(N^2)
      61             :      // operations.  Returns key.  Assumes key is non-NULL, cnt is valid
      62             :      // and positive and rnk is in [0,cnt)
      63             : 
      64             :      double *
      65             :      sort_double_descend_select( double * key,
      66             :                                  ulong    cnt,
      67             :                                  ulong    rnk );
      68             : 
      69             :      // Given a sorted array sorted indexed [0,cnt),
      70             :      // sort_double_descend_split returns an index i in [0,cnt] such all
      71             :      // entries in [0,i) are BEFORE query and all entries [i,cnt) are
      72             :      // NOT BEFORE query.
      73             : 
      74             :      ulong
      75             :      sort_double_descend_split( double const * sorted,
      76             :                                 ulong          cnt,
      77             :                                 double         query );
      78             : 
      79             :      // sort_double_descend_{stable_fast,stable,inplace}_para is the
      80             :      // same as // above is adaptively parallelized over the caller
      81             :      // (typically tpool thread t0) and tpool threads (t0,t1).  Assumes
      82             :      // tpool is valid, tpool threads (t0,t1) are idle (or soon to be
      83             :      // idle).  These are only available if SORT_PARALLEL was requested.
      84             : 
      85             :      double *
      86             :      sort_double_descend_stable_fast_para( fd_tpool_t * tpool, ulong t0, ulong t1,
      87             :                                            double * key,
      88             :                                            ulong    cnt,
      89             :                                            void *   scratch );
      90             : 
      91             :      double *
      92             :      sort_double_descend_stable_para( fd_tpool_t * tpool, ulong t0, ulong t1,
      93             :                                       double * key,
      94             :                                       ulong    cnt,
      95             :                                       void *   scratch );
      96             : 
      97             :      double *
      98             :      sort_double_descend_inplace_para( fd_tpool_t * tpool, ulong t0, ulong t1,
      99             :                                        double * key,
     100             :                                        ulong    cnt );
     101             : 
     102             :      // sort_double_descend_fast_para has better asymptotically
     103             :      // parallelization on average (~(N lg N)/T + lg T) but requires
     104             :      // more stack space on the caller (T^2).  seed gives a random seed
     105             :      // to use (e.g. fd_tickcount()).  If stable is non-zero, a stable
     106             :      // method will be used (and the resulting order will always be
     107             :      // deterministic).  Otherwise a faster unstable method will be used
     108             :      // (the order in which equal keys end up in the array will depend
     109             :      // on seed).  Assumes tpool is valid and  tpool threads (t0,t1) are
     110             :      // idle (or soon to be idle).  Only available if SORT_PARALLEL was
     111             :      // requested and the target has FD_HAS_ALLOCA.
     112             : 
     113             :      double *
     114             :      sort_double_descend_fast_para( fd_tpool_t * tpool, ulong t0, ulong t1,
     115             :                                     double * key,
     116             :                                     ulong    cnt,
     117             :                                     void *   scratch,
     118             :                                     ulong    seed,
     119             :                                     int      stable );
     120             : 
     121             :    It is fine to include this template multiple times in a compilation
     122             :    unit.  Just provide the specification before each inclusion.  Various
     123             :    additional options to tune the methods are described below. */
     124             : 
     125             : /* SORT_NAME gives the name of the function to declare (and the base
     126             :    name of auxiliary and/or variant functions). */
     127             : 
     128             : #ifndef SORT_NAME
     129             : #error "SORT_NAME must be defined"
     130             : #endif
     131             : 
     132             : /* SORT_KEY_T gives the POD datatype to sort. */
     133             : 
     134             : #ifndef SORT_KEY_T
     135             : #error "SORT_KEY_T must be defined"
     136             : #endif
     137             : 
     138             : /* SORT_IDX_T gives the data type used to index the arrays */
     139             : 
     140             : #ifndef SORT_IDX_T
     141  1748115683 : #define SORT_IDX_T ulong
     142             : #endif
     143             : 
     144             : /* SORT_BEFORE(a,b) evaluates to 1 if a<b is strictly true.  SAFETY TIP:
     145             :    This is not a 3-way comparison function! */
     146             : 
     147             : #ifndef SORT_BEFORE
     148     9120832 : #define SORT_BEFORE(a,b) (a)<(b)
     149             : #endif
     150             : 
     151             : /* SORT_MERGE_THRESH / SORT_QUICK_THRESH give largest merge/quick
     152             :    partition size where insertion sort will be used.  Should be at least
     153             :    2 (and probably larger than one might expect to get optimal
     154             :    performance). */
     155             : 
     156             : #ifndef SORT_MERGE_THRESH
     157     6340224 : #define SORT_MERGE_THRESH 32
     158             : #endif
     159             : 
     160             : #ifndef SORT_QUICK_THRESH
     161             : #define SORT_QUICK_THRESH 32
     162             : #endif
     163             : 
     164             : /* SORT_QUICK_ORDER_STYLE selects the method used for ordering two keys.
     165             :    Roughly, say 1 for floating point keys (see quick method for
     166             :    details). */
     167             : 
     168             : #ifndef SORT_QUICK_ORDER_STYLE
     169             : #define SORT_QUICK_ORDER_STYLE 0
     170             : #endif
     171             : 
     172             : /* SORT_QUICK_SWAP_MINIMIZE indicates that quick sort should eat the
     173             :    non-deterministic branch cost to avoid no-op swaps.  This is useful
     174             :    if the key_t has a large memory footprint such that the no-op is
     175             :    actually a larger cost than a branch mispredict. */
     176             : 
     177             : #ifndef SORT_QUICK_SWAP_MINIMIZE
     178             : #define SORT_QUICK_SWAP_MINIMIZE 0
     179             : #endif
     180             : 
     181             : /* SORT_PARALLEL will generate thread parallel versions of many sorts.
     182             :    Requires tpool. */
     183             : 
     184             : #ifndef SORT_PARALLEL
     185             : #define SORT_PARALLEL 0
     186             : #endif
     187             : 
     188             : /* SORT_OVERSAMPLE_RATIO indicates the amount of oversampling fast
     189             :    parallel sorts should use to fine pivots.  Only relevant if
     190             :    SORT_PARALLEL is true and FD_HAS_ALLOCA. */
     191             : 
     192             : #ifndef SORT_OVERSAMPLE_RATIO
     193           0 : #define SORT_OVERSAMPLE_RATIO 5
     194             : #endif
     195             : 
     196             : /* SORT_IDX_IF(c,t,f) returns sort_idx t if c is non-zero and sort_idx f o.w. */
     197             : 
     198             : #ifndef SORT_IDX_IF
     199   604539500 : #define SORT_IDX_IF(c,t,f) ((SORT_IDX_T)fd_ulong_if( (c), (ulong)(t), (ulong)(f) ))
     200             : #endif
     201             : 
     202             : /* SORT_FN_ATTR applies extra function attributes. */
     203             : 
     204             : #ifndef SORT_FN_ATTR
     205             : #define SORT_FN_ATTR
     206             : #endif
     207             : 
     208             : /* 0 - local use only
     209             :    1 - library header declaration
     210             :    2 - library implementation */
     211             : 
     212             : #ifndef SORT_IMPL_STYLE
     213             : #define SORT_IMPL_STYLE 0
     214             : #endif
     215             : 
     216             : /* Implementation *****************************************************/
     217             : 
     218             : #if SORT_IMPL_STYLE==0 /* local use only */
     219             : #define SORT_STATIC FD_FN_UNUSED static
     220             : #else /* library header and/or implementation */
     221             : #define SORT_STATIC
     222             : #endif
     223             : 
     224    21988184 : #define SORT_(x)FD_EXPAND_THEN_CONCAT3(SORT_NAME,_,x)
     225             : 
     226             : #if SORT_IMPL_STYLE!=2 /* need header */
     227             : 
     228             : #if SORT_PARALLEL
     229             : #include "../tpool/fd_tpool.h"
     230             : #else
     231             : #include "../bits/fd_bits.h"
     232             : #endif
     233             : 
     234             : FD_PROTOTYPES_BEGIN
     235             : 
     236             : FD_FN_CONST static inline int
     237           0 : SORT_(cnt_valid)( SORT_IDX_T cnt ) {
     238           0 :   /* Written funny for supporting different signed idx types without
     239           0 :      generating compiler warnings. */
     240           0 :   SORT_IDX_T max = (SORT_IDX_T)fd_ulong_min( (-alignof(SORT_KEY_T))/sizeof(SORT_KEY_T), /* ==floor( (2^64-align-1)/sz ) */
     241           0 :                                              (ulong)(SORT_IDX_T)((1UL<<57)-1UL) );      /* ==SORT_IDX_MAX as ulong */
     242           0 :   return (!cnt) | ((((SORT_IDX_T)0)<cnt) & (cnt<max)) | (cnt==max);
     243           0 : }
     244             : 
     245           0 : FD_FN_CONST static inline ulong SORT_(stable_scratch_align)    ( void )           { return alignof(SORT_KEY_T); }
     246           0 : FD_FN_CONST static inline ulong SORT_(stable_scratch_footprint)( SORT_IDX_T cnt ) { return sizeof (SORT_KEY_T)*(ulong)cnt; }
     247             : 
     248             : SORT_FN_ATTR SORT_STATIC SORT_KEY_T *
     249             : SORT_(private_merge)( SORT_KEY_T * key,
     250             :                       long         cnt,
     251             :                       SORT_KEY_T * tmp );
     252             : 
     253             : SORT_FN_ATTR SORT_STATIC SORT_KEY_T *
     254             : SORT_(private_quick)( SORT_KEY_T * key,
     255             :                       SORT_IDX_T   cnt );
     256             : 
     257             : SORT_FN_ATTR SORT_STATIC SORT_KEY_T *
     258             : SORT_(private_select)( SORT_KEY_T * key,
     259             :                        SORT_IDX_T   cnt,
     260             :                        SORT_IDX_T   rnk );
     261             : 
     262             : SORT_FN_ATTR SORT_STATIC SORT_KEY_T *
     263             : SORT_(insert)( SORT_KEY_T * key,
     264             :                SORT_IDX_T   cnt );
     265             : 
     266             : static inline SORT_KEY_T *
     267             : SORT_(stable_fast)( SORT_KEY_T * key,
     268             :                     SORT_IDX_T   cnt,
     269      510579 :                     void *       scratch ) {
     270      510579 :   if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key;
     271      510240 :   SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
     272      510240 :   return SORT_(private_merge)( key, (long)cnt, tmp );
     273      510579 : }
     274             : 
     275             : static inline SORT_KEY_T *
     276             : SORT_(stable)( SORT_KEY_T * key,
     277             :                SORT_IDX_T   cnt,
     278      510576 :                void *       scratch ) {
     279      510576 :   if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key;
     280      510240 :   SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
     281      510240 :   if( SORT_(private_merge)( key, (long)cnt, tmp )==tmp ) /* 50/50 branch prob */
     282      105270 :     memcpy( key, tmp, sizeof(SORT_KEY_T)*(ulong)cnt );
     283      510240 :   return key;
     284      510576 : }
     285             : 
     286             : static inline SORT_KEY_T *
     287             : SORT_(inplace)( SORT_KEY_T * key,
     288     4699236 :                 SORT_IDX_T   cnt ) {
     289     4699236 :   return SORT_(private_quick)( key, cnt );
     290     4699236 : }
     291             : 
     292             : static inline SORT_KEY_T *
     293             : SORT_(select)( SORT_KEY_T * key,
     294             :                SORT_IDX_T   cnt,
     295      231484 :                SORT_IDX_T   rnk ) {
     296      231484 :   return SORT_(private_select)( key, cnt, rnk );
     297      231484 : }
     298             : 
     299             : static inline SORT_IDX_T
     300             : SORT_(split)( SORT_KEY_T const * sorted,
     301             :               SORT_IDX_T         cnt,
     302        3372 :               SORT_KEY_T         query ) {
     303        3372 :   SORT_IDX_T j = (SORT_IDX_T)0;
     304        3372 :   SORT_IDX_T k = (SORT_IDX_T)cnt;
     305       18069 :   for(;;) {
     306       18069 :     SORT_IDX_T n = k-j;
     307       18069 :     if( FD_UNLIKELY( n<(SORT_IDX_T)1 ) ) break;
     308             : 
     309             :     /* At this point, entries [0,j) are known "lt"  query (i.e. before),
     310             :                       entries [j,k) are unknown (this range is non-empty and sorted),
     311             :                       entries [k,n) are known "geq" query (i.e. not before)
     312             :        Test the entry in the middle of the unknown range. */
     313             : 
     314       14697 :     SORT_IDX_T m = j + (n>>1);
     315             : 
     316       14697 :     int c = SORT_BEFORE( sorted[ m ], query );
     317             : 
     318             :     /* At this point:
     319             :         If c is 1, entry m is     before query.  As such, entries [0,m] are all known "lt"  query and range (m,k) is unknown now.
     320             :         If c is 0, entry m is not before query.  As such, entries [m,n) are all known "geq" query and range [j,m) is unknown now. */
     321             : 
     322       14697 :     j = c ? (m+(SORT_IDX_T)1) : j; /* cmov */
     323       14697 :     k = c ?  k                : m; /* cmov */
     324       14697 :   }
     325             : 
     326             :   /* At this point, [0,j) are known "lt" query and [k,n) are known
     327             :      "geq" query and j==k such that [j,k) is an empty range. */
     328             : 
     329        3372 :   return k;
     330        3372 : }
     331             : 
     332             : #if SORT_PARALLEL
     333             : 
     334             : SORT_STATIC FD_MAP_REDUCE_PROTO( SORT_(private_merge_para)   );
     335             : SORT_STATIC FD_FOR_ALL_PROTO   ( SORT_(private_memcpy_para)  );
     336             : 
     337             : static inline SORT_KEY_T *
     338             : SORT_(stable_fast_para)( fd_tpool_t * tpool,
     339             :                          ulong        t0,
     340             :                          ulong        t1,
     341             :                          SORT_KEY_T * key,
     342             :                          SORT_IDX_T   cnt,
     343           0 :                          void *       scratch ) {
     344             : 
     345             :   /* The wallclock time of the below for N keys and T threads is
     346             :      roughly:
     347             : 
     348             :        alpha N + beta N (ln N - ln T) / T + gamma ln T
     349             : 
     350             :      where the first term represents the cost of the final sort rounds
     351             :      (which are parallelized over increasingly fewer threads than the
     352             :      initial rounds ... wallclock sums up to something proportional to N
     353             :      in the limit of large T), the second term represents the cost of
     354             :      the initial parallel rounds (which are parallelized over T threads)
     355             :      and the last term represents the wallclock to dispatch and
     356             :      synchronize the threads.  Optimizing T for a given N yields:
     357             : 
     358             :        -beta N (ln N - ln T_opt) / T_opt^2 - beta N / T_opt^2 + gamma / T_opt = 0
     359             : 
     360             :      Solving for T_opt in the limit N >> T_opt yields:
     361             : 
     362             :        T_opt = (beta/gamma) N ln N
     363             : 
     364             :      where the ratio beta is roughly the merge pass marginal wallclock
     365             :      per key and gamma is proportional to the marginal wallclock to
     366             :      start and stop a thread.  Since these are typically used over a
     367             :      domain of moderate N, we can approximate ln N by ln N_ref,
     368             :      yielding:
     369             : 
     370             :        T_opt ~ N / thresh
     371             : 
     372             :     where thresh ~ gamma / (beta ln N_ref).  We use an empirical value
     373             :     such threads will have at least one normal page of work of keys to
     374             :     process in a block typically. */
     375             : 
     376           0 :   if( FD_UNLIKELY( cnt<2L ) ) return key;
     377             : 
     378           0 :   static ulong const thresh = (4096UL + sizeof(SORT_KEY_T)-1UL) / sizeof(SORT_KEY_T); /* ceil(page_sz/key_sz) */
     379           0 :   ulong t_cnt = fd_ulong_min( t1 - t0, ((ulong)cnt) / thresh );
     380           0 :   if( FD_UNLIKELY( t_cnt<2UL ) ) return SORT_(stable_fast)( key, cnt, scratch );
     381           0 :   t1 = t0 + t_cnt;
     382             : 
     383           0 :   SORT_KEY_T * out[1];
     384           0 :   FD_MAP_REDUCE( SORT_(private_merge_para), tpool,t0,t1, 0L,(long)cnt, out, key, scratch );
     385           0 :   return out[0];
     386           0 : }
     387             : 
     388             : static inline SORT_KEY_T *
     389             : SORT_(stable_para)( fd_tpool_t * tpool,
     390             :                     ulong        t0,
     391             :                     ulong        t1,
     392             :                     SORT_KEY_T * key,
     393             :                     SORT_IDX_T   cnt,
     394           0 :                     void *       scratch ) {
     395             : 
     396             :   /* This works the same as the above but does a parallel memcpy if the
     397             :      result doesn't end up in the correct place. */
     398             : 
     399           0 :   if( FD_UNLIKELY( cnt<2L ) ) return key;
     400             : 
     401           0 :   static ulong const thresh = (4096UL + sizeof(SORT_KEY_T)-1UL) / sizeof(SORT_KEY_T); /* ceil(page_sz/key_sz) */
     402           0 :   ulong t_cnt = fd_ulong_min( t1 - t0, ((ulong)cnt) / thresh );
     403           0 :   if( FD_UNLIKELY( t_cnt<2UL ) ) return SORT_(stable)( key, cnt, scratch );
     404           0 :   t1 = t0 + t_cnt;
     405             : 
     406           0 :   SORT_KEY_T * out[1];
     407           0 :   FD_MAP_REDUCE( SORT_(private_merge_para), tpool,t0,t1, 0L,(long)cnt, out, key, scratch );
     408           0 :   if( out[0]!=key ) FD_FOR_ALL( SORT_(private_memcpy_para), tpool,t0,t1, 0L,(long)cnt, key, scratch ); /* 50/50 branch prob */
     409           0 :   return key;
     410           0 : }
     411             : 
     412             : SORT_FN_ATTR SORT_STATIC void
     413             : SORT_(private_quick_node)( void * _tpool,
     414             :                            ulong  t0,      ulong t1,
     415             :                            void * _args,
     416             :                            void * _reduce, ulong _stride,
     417             :                            ulong  _l0,     ulong _l1,
     418             :                            ulong  _m0,     ulong _m1,
     419             :                            ulong  _n0,     ulong _n1 );
     420             : 
     421             : static inline SORT_KEY_T *
     422             : SORT_(inplace_para)( fd_tpool_t * tpool,
     423             :                      ulong        t0,
     424             :                      ulong        t1,
     425             :                      SORT_KEY_T * key,
     426           0 :                      SORT_IDX_T   cnt ) {
     427           0 :   if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key;
     428           0 :   SORT_(private_quick_node)( tpool,t0,t1, (void *)key, (void *)(ulong)cnt, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL );
     429           0 :   return key;
     430           0 : }
     431             : 
     432             : #if FD_HAS_ALLOCA
     433             : 
     434             : SORT_FN_ATTR SORT_STATIC SORT_KEY_T *
     435             : SORT_(fast_para)( fd_tpool_t * tpool, ulong t0, ulong t1,
     436             :                   SORT_KEY_T * key,
     437             :                   SORT_IDX_T   cnt,
     438             :                   void *       scratch,
     439             :                   ulong        seed,
     440             :                   int          stable );
     441             : 
     442             : #endif /* FD_HAS_ALLOCA */
     443             : 
     444             : #endif /* SORT_PARALLEL */
     445             : 
     446             : FD_PROTOTYPES_END
     447             : 
     448             : #endif
     449             : 
     450             : #if SORT_IMPL_STYLE!=1 /* need implementations (assumes header already included) */
     451             : 
     452             : SORT_FN_ATTR SORT_KEY_T *
     453             : SORT_(insert)( SORT_KEY_T * key,
     454     8873959 :                SORT_IDX_T   cnt ) {
     455   134563797 :   for( SORT_IDX_T i=((SORT_IDX_T)1); i<cnt; i++ ) {
     456   125689838 :     SORT_KEY_T key_i = key[i];
     457   125689838 :     SORT_IDX_T j = i;
     458   965119372 :     while( j ) {
     459   958785324 :       SORT_IDX_T k = j - ((SORT_IDX_T)1);
     460   958785324 :       SORT_KEY_T key_j = key[k]; if( !(SORT_BEFORE( key_i, key_j )) ) break;
     461   839429534 :       key[j] = key_j;
     462   839429534 :       j = k;
     463   839429534 :     }
     464   125689838 :     key[j] = key_i;
     465   125689838 :   }
     466     8873959 :   return key;
     467     8873959 : }
     468             : 
     469             : SORT_FN_ATTR static void
     470             : SORT_(private_merge_pass)( SORT_KEY_T const * key_l, long cnt_l,
     471             :                            SORT_KEY_T const * key_r, long cnt_r,
     472     2659872 :                            SORT_KEY_T       * key_m ) {
     473     2659872 :   long i = 0L;
     474     2659872 :   long j = 0L;
     475     2659872 :   long k = 0L;
     476     2659872 : # if 1 /* Minimal C language operations */
     477   144635166 :   for(;;) { /* Note that cnt_left>0 and cnt_right>0 as cnt > SORT_MERGE_THRESH >= 1 at this point */
     478   144635166 :     if( SORT_BEFORE( key_r[j], key_l[i] ) ) {
     479    52344009 :       key_m[k++] = key_r[j++];
     480    52344009 :       if( j>=cnt_r ) {
     481      170760 :         memcpy( key_m + k, key_l + i, sizeof(SORT_KEY_T)*(ulong)(cnt_l-i) ); /* append left  stragglers (at least one) */
     482      170760 :         break;
     483      170760 :       }
     484    92291157 :     } else {
     485    92291157 :       key_m[k++] = key_l[i++];
     486    92291157 :       if( i>=cnt_l ) {
     487     2489112 :         memcpy( key_m + k, key_r + j, sizeof(SORT_KEY_T)*(ulong)(cnt_r-j) ); /* append right stragglers (at least one) */
     488     2489112 :         break;
     489     2489112 :       }
     490    92291157 :     }
     491   144635166 :   }
     492             : # else /* Branch free variant (Pyth investigations suggested the above is better) */
     493             :   while( ((i<cnt_l) & (j<cnt_r)) ) {
     494             :     SORT_KEY_T ki = key_l[ i ];
     495             :     SORT_KEY_T kj = key_r[ j ];
     496             :     int use_left = !(SORT_BEFORE( kj, ki ));
     497             :     key_m[ k ] = use_left ? ki : kj; /* cmov ideally */
     498             :     i += (long) use_left;
     499             :     j += (long)!use_left;
     500             :     k++;
     501             :   }
     502             :   if(      i<cnt_l ) memcpy( key_m + k, key_l + i, sizeof(SORT_KEY_T)*(ulong)(cnt_l-i) );
     503             :   else if( j<cnt_r ) memcpy( key_m + k, key_r + j, sizeof(SORT_KEY_T)*(ulong)(cnt_r-j) );
     504             : # endif
     505     2659872 : }
     506             : 
     507             : SORT_FN_ATTR SORT_KEY_T *
     508             : SORT_(private_merge)( SORT_KEY_T * key,
     509             :                       long         cnt,
     510     6340224 :                       SORT_KEY_T * tmp ) {
     511             : 
     512             :   /* FIXME: USE IMPLICIT RECURSION ALA QUICK BELOW? */
     513             : 
     514             :   /* If below threshold, use insertion sort */
     515             : 
     516     6340224 :   if( cnt<=((long)(SORT_MERGE_THRESH)) ) return SORT_(insert)( key, (SORT_IDX_T)cnt );
     517             : 
     518             :   /* Otherwise, break input in half and sort the halves */
     519             : 
     520     2659872 :   SORT_KEY_T * key_l = key;
     521     2659872 :   SORT_KEY_T * tmp_l = tmp;
     522     2659872 :   long         cnt_l = cnt >> 1;
     523     2659872 :   SORT_KEY_T * in_l  = SORT_(private_merge)( key_l, cnt_l, tmp_l );
     524             : 
     525     2659872 :   SORT_KEY_T * key_r = key + cnt_l;
     526     2659872 :   SORT_KEY_T * tmp_r = tmp + cnt_l;
     527     2659872 :   long         cnt_r = cnt - cnt_l;
     528     2659872 :   SORT_KEY_T * in_r  = SORT_(private_merge)( key_r, cnt_r, tmp_r );
     529             : 
     530             :   /* Merge in_l / in_r */
     531             : 
     532     2659872 :   SORT_KEY_T * out = (in_l==key) ? tmp : key; /* If in_l overlaps with key, merge into tmp.  Otherwise, merge into key */
     533             : 
     534             :   /* Note that in_l does not overlap with out at this point.  in_r might
     535             :      overlap with the right half of out but the merge pass is fine for
     536             :      that case. */
     537             : 
     538     2659872 :   SORT_(private_merge_pass)( in_l, cnt_l, in_r, cnt_r, out );
     539             : 
     540     2659872 :   return out;
     541     6340224 : }
     542             : 
     543             : /* This uses a dual pivot quick sort for better theoretical and
     544             :    practical mojo. */
     545             : 
     546             : SORT_FN_ATTR SORT_KEY_T *
     547             : SORT_(private_quick)( SORT_KEY_T * key,
     548     4699236 :                       SORT_IDX_T   cnt ) {
     549     4699236 :   SORT_IDX_T stack[ 4UL*8UL*sizeof(SORT_IDX_T) ]; /* See note below on sizing */
     550     4699236 :   ulong      stack_cnt = 0UL;
     551             : 
     552     4699236 :   stack[ stack_cnt++ ] = (SORT_IDX_T)0;
     553     4699236 :   stack[ stack_cnt++ ] = cnt;
     554    12226593 :   for(;;) {
     555             : 
     556             :     /* Pop the next partition to process */
     557             : 
     558    12226593 :     if( !stack_cnt ) return key; /* All done */
     559     7527357 :     SORT_IDX_T h = stack[ --stack_cnt ];
     560     7527357 :     SORT_IDX_T l = stack[ --stack_cnt ];
     561             : 
     562             :     /* [l,h) is the partition we are sorting.  If this partition is
     563             :        small enough, sort it via insertion sort. */
     564             : 
     565     7527357 :     SORT_IDX_T n = h-l;
     566     7527357 :     if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) {
     567     4145556 :       SORT_(insert)( key+l, n );
     568     4145556 :       continue;
     569     4145556 :     }
     570             : 
     571             :     /* This partition is too large to insertion sort.  Pick two pivots,
     572             :        sort the partition into a left, center and right partition and
     573             :        then recursively sort those partitions.  We initially use a
     574             :        simple choice of pivots that is ideal for nearly sorted data (in
     575             :        either direction) with a little extra sampling to improve the
     576             :        partitioning for randomly ordered data.  There is a possibility
     577             :        that this deterministic choice of pivots will not succeed in
     578             :        making two or more non-empty partitions; we detect and correct
     579             :        that below. */
     580             : 
     581             :     /* Initial pivot selection */
     582             : 
     583     3381801 :     SORT_KEY_T p1;
     584     3381801 :     SORT_KEY_T p2;
     585     3381801 :     do {
     586     3381801 :       SORT_IDX_T n3 = n / (SORT_IDX_T)3; /* magic multiply under the hood typically */
     587     3381801 :       SORT_IDX_T h1 = h - (SORT_IDX_T)1;
     588     3381801 :       SORT_KEY_T p0 = key[ l       ];
     589     3381801 :       /**/       p1 = key[ l  + n3 ];
     590     3381801 :       /**/       p2 = key[ h1 - n3 ];
     591     3381801 :       SORT_KEY_T p3 = key[ h1      ];
     592     3381801 : #     if SORT_QUICK_ORDER_STYLE==0
     593             :       /* Generates better code for integral types (branchless via stack) */
     594     3381801 :       SORT_KEY_T _k[2]; ulong _c;
     595    16909005 : #     define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
     596             : #     else
     597             :       /* Generates much better code for floating point types (branchless
     598             :          via min / max floating point instructions) */
     599             :       SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
     600             : #     define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
     601             : #     endif
     602             : 
     603     3381801 :       ORDER(p0,p2); ORDER(p1,p3);
     604     3381801 :       ORDER(p0,p1); ORDER(p2,p3);
     605     3381801 :       ORDER(p1,p2);
     606     3381801 : #     undef ORDER
     607     3381801 :     } while(0);
     608             : 
     609     3770133 :   retry: /* Three way partitioning (silly language restriction) */;
     610     3770133 :     SORT_IDX_T i = l;
     611     3770133 :     SORT_IDX_T j = l;
     612     3770133 :     SORT_IDX_T k = h-(SORT_IDX_T)1;
     613   537890262 :     do { /* Note [j,k] is non-empty (look ma ... no branches!) */
     614             : 
     615             :       /* At this point, p1 <= p2 and:
     616             :          - keys [l,i) are before p1 (left partition)
     617             :          - keys [i,j) are in [p1,p2] (center partition)
     618             :          - keys [j,k] are not yet partitioned (region is non-empty)
     619             :          - keys (k,h) are after p2 (right partition)
     620             :          Decide where key j should be moved. */
     621             : 
     622   537890262 :       SORT_KEY_T kj = key[j];
     623             : 
     624   537890262 :       int to_left  = (SORT_BEFORE( kj, p1 ));
     625   537890262 :       int to_right = (SORT_BEFORE( p2, kj ));
     626   537890262 :       SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );
     627             : 
     628             : #     if SORT_QUICK_SWAP_MINIMIZE
     629             :       if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; } /* ~2/3 prob */
     630             : #     else
     631             :       /**/                      key[j] = key[m]; key[m] = kj;
     632   537890262 : #     endif
     633             : 
     634   537890262 :       i += (SORT_IDX_T) to_left;
     635   537890262 :       j += (SORT_IDX_T)(to_right^1); /* to_left_or_to_center <=> !to_right */
     636   537890262 :       k -= (SORT_IDX_T) to_right;
     637   537890262 :     } while( j<=k );
     638             : 
     639             :     /* Schedule recursion */
     640             : 
     641     3770133 :     if( FD_UNLIKELY( (j-i)==n ) ) {
     642             : 
     643             :       /* All the keys ended up in the center partition.  If p1<p2, we
     644             :          had an unlucky choice of pivots such that p1==min(key[i]) and
     645             :          p2==max(key[i]) for i in [l,h).  Since we picked the keys
     646             :          deterministically above, we would have the same bad luck the
     647             :          next time we processed this partition.  But, because there are
     648             :          at least two distinct elements in the partition, there is a
     649             :          choice of pivots that will make a non-trivial partition, so we
     650             :          need to redo this partition with different pivots.
     651             : 
     652             :          Randomly picking a new pivot pair in [l,h) has probability
     653             :          between ~50% and ~100%.  The worst case is half the keys in
     654             :          the partition equal to p1 and the other half equal to p2; the
     655             :          best case exactly one key is equal to p1 / p2 and all other
     656             :          keys are in (p1,p2] / [p1,p2).
     657             : 
     658             :          The optimal handling of the worst case though is just to set
     659             :          p1=p2 (p2=p1).  This will pull the keys equal to p1 (p2) into
     660             :          the left (right) partition and the remaining keys into the
     661             :          center partition; the right (left) partition will be empty.
     662             :          Thus, this is guaranteed to yield a non-trivial partition of
     663             :          the center partition but may not yield as load balanced set of
     664             :          partitions as random for the next iteration.  We go with the
     665             :          deterministic option for simplicity below. */
     666             : 
     667     1550499 :       if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }
     668             : 
     669             :       /* p1==p2 here ... all keys in this partition are the same
     670             :          We don't need to recurse further in this partition. */
     671             : 
     672     2219634 :     } else {
     673             : 
     674             :       /* Between [1,n-1] keys ended up in the center partition such that
     675             :          at least 1 key ended up in another partition.  We push all
     676             :          three partitions onto the partition stack for recursion.  The
     677             :          order of the pushes is important to bound the maximum stack
     678             :          usage.  Below, we push the center partition followed by the
     679             :          larger of the left and right partitions then followed by the
     680             :          smaller of the left and right partitions (such that we process
     681             :          the smallest next iteration).  For this, the same O(log_2 cnt)
     682             :          max recursion depth applies as quicksort even this does a three
     683             :          way partitioning.  That is, let fl/fc/fr be the fraction of
     684             :          keys in the left, center, right partitions.  At this point, fl
     685             :          is in [0,1), fc is in (0,1) and fr is in [0,1) and fl+fc+fr=1.
     686             :          The smaller of the left or right partition is what process next
     687             :          and, as the smaller of the left or right, it fraction of the
     688             :          keys is at most 0.5(1-fc)<=0.5.  We do push up to two
     689             :          partitions onto the stack (four SORT_IDX_T) each level of
     690             :          recursion though instead of one like normal quick sort though. */
     691             : 
     692     2219634 :       int left_larger = (i-l) > (h-j); /* True if the left partition should go on the stack */
     693     2219634 :       SORT_IDX_T l_larger  = SORT_IDX_IF( left_larger, l, j );
     694     2219634 :       SORT_IDX_T h_larger  = SORT_IDX_IF( left_larger, i, h );
     695     2219634 :       SORT_IDX_T l_smaller = SORT_IDX_IF( left_larger, j, l );
     696     2219634 :       SORT_IDX_T h_smaller = SORT_IDX_IF( left_larger, h, i );
     697             : 
     698             :       /* Immediately process empty partitions */
     699     2219634 :       ulong push_smaller = (ulong)(l_smaller<h_smaller); FD_COMPILER_FORGET( push_smaller );
     700             : 
     701             :       /* Immediately process partitions where all keys are the same */
     702     2219634 :       ulong push_center  = (ulong)(SORT_BEFORE( p1, p2 )); FD_COMPILER_FORGET( push_center );
     703             : 
     704             :       /* Guaranteed non-empty (larger of left or right have at least 1 key) */
     705     2219634 :       ulong push_larger  = 1UL;
     706             : 
     707     2219634 :       stack[ stack_cnt ] = l_larger;  stack_cnt += push_larger;
     708     2219634 :       stack[ stack_cnt ] = h_larger;  stack_cnt += push_larger;
     709     2219634 :       stack[ stack_cnt ] = i;         stack_cnt += push_center;
     710     2219634 :       stack[ stack_cnt ] = j;         stack_cnt += push_center;
     711     2219634 :       stack[ stack_cnt ] = l_smaller; stack_cnt += push_smaller;
     712     2219634 :       stack[ stack_cnt ] = h_smaller; stack_cnt += push_smaller;
     713     2219634 :     }
     714     3770133 :   }
     715             :   /* never get here */
     716     4699236 : }
     717             : 
     718             : /* This works identical to sort_private_quick.  Only differences
     719             :    relevant to selection are commented (in short, we only recurse on the
     720             :    partition that could contain rank).  See above for comments on the
     721             :    algo. */
     722             : 
     723             : SORT_FN_ATTR SORT_KEY_T *
     724             : SORT_(private_select)( SORT_KEY_T * key,
     725             :                        SORT_IDX_T   cnt,
     726      231484 :                        SORT_IDX_T   rnk ) {
     727      231484 :   SORT_IDX_T l = (SORT_IDX_T)0;
     728      231484 :   SORT_IDX_T h = cnt;
     729      748693 :   for(;;) {
     730             : 
     731             :     /* The partition [l,h) contains rnk.  If this partition is small
     732             :        enough, sort it via insertion sort.  FIXME: probably could
     733             :        truncate the insertion sort early for further optimization (e.g.
     734             :        if rnk==l / rnk==h-1, we only need to find the min/max for O(n)
     735             :        operation). */
     736             : 
     737      748693 :     SORT_IDX_T n = h-l;
     738      748693 :     if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) { SORT_(insert)( key+l, n ); return key; }
     739             : 
     740      517233 :     SORT_KEY_T p1;
     741      517233 :     SORT_KEY_T p2;
     742      517233 :     do {
     743      517233 :       SORT_IDX_T n3 = n / (SORT_IDX_T)3;
     744      517233 :       SORT_IDX_T h1 = h - (SORT_IDX_T)1;
     745      517233 :       SORT_KEY_T p0 = key[ l       ];
     746      517233 :       /**/       p1 = key[ l  + n3 ];
     747      517233 :       /**/       p2 = key[ h1 - n3 ];
     748      517233 :       SORT_KEY_T p3 = key[ h1      ];
     749      517233 : #     if SORT_QUICK_ORDER_STYLE==0
     750      517233 :       SORT_KEY_T _k[2]; ulong _c;
     751     2586165 : #     define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
     752             : #     else
     753             :       SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
     754             : #     define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
     755             : #     endif
     756             : 
     757      517233 :       ORDER(p0,p2); ORDER(p1,p3);
     758      517233 :       ORDER(p0,p1); ORDER(p2,p3);
     759      517233 :       ORDER(p1,p2);
     760      517233 : #     undef ORDER
     761      517233 :     } while(0);
     762             : 
     763      517233 :   retry: /* Silly language restriction */;
     764      517233 :     SORT_IDX_T i = l;
     765      517233 :     SORT_IDX_T j = l;
     766      517233 :     SORT_IDX_T k = h-(SORT_IDX_T)1;
     767    57770702 :     do {
     768             : 
     769    57770702 :       SORT_KEY_T kj = key[j];
     770             : 
     771    57770702 :       int to_left  = (SORT_BEFORE( kj, p1 ));
     772    57770702 :       int to_right = (SORT_BEFORE( p2, kj ));
     773    57770702 :       SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );
     774             : 
     775             : #     if SORT_QUICK_SWAP_MINIMIZE
     776             :       if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; }
     777             : #     else
     778             :       /**/                      key[j] = key[m]; key[m] = kj;
     779    57770702 : #     endif
     780             : 
     781    57770702 :       i += (SORT_IDX_T) to_left;
     782    57770702 :       j += (SORT_IDX_T)(to_right^1);
     783    57770702 :       k -= (SORT_IDX_T) to_right;
     784    57770702 :     } while( j<=k );
     785             : 
     786      517233 :     if( FD_UNLIKELY( (j-i)==n ) ) {
     787             : 
     788          24 :       if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }
     789             : 
     790             :       /* p1==p2 here ... all keys in this partition are the same.  We
     791             :          don't need to recurse further in this partition as all keys
     792             :          would do here. */
     793             : 
     794          24 :       return key;
     795          24 :     }
     796             : 
     797             :     /* At this point:
     798             :        - [l,i) is the left partition,
     799             :        - [i,j) is the center partition
     800             :        - [j,h) is the right partition
     801             :        Between [1,n-1] keys ended up in the center partition such that
     802             :        at least 1 key ended up in another partition.  Recurse on the
     803             :        partition that contains rnk.  As this is typically used in median
     804             :        finding, we assume that the center partition is the most likely
     805             :        in the below selection (this can also be done branchlessly). */
     806             : 
     807      517209 :     SORT_IDX_T l_next = i; SORT_IDX_T h_next = j;
     808      517209 :     if( FD_UNLIKELY( rnk< i   ) ) l_next = l, h_next = i;
     809      517209 :     if( FD_UNLIKELY( j  <=rnk ) ) l_next = j, h_next = h;
     810      517209 :     l = l_next; h = h_next;
     811      517209 :   }
     812             :   /* never get here */
     813      231484 : }
     814             : 
     815             : #if SORT_PARALLEL
     816             : 
     817             : /* This works identical to sort_private_quick.  Only differences
     818             :    relevant to parallelization are commented.  See above for comments on
     819             :    the algo. */
     820             : 
     821             : SORT_FN_ATTR void
     822             : SORT_(private_quick_node)( void * _tpool,
     823             :                            ulong  t0,      ulong t1,
     824             :                            void * _args,
     825             :                            void * _reduce, ulong _stride,
     826             :                            ulong  _l0,     ulong _l1,
     827             :                            ulong  _m0,     ulong _m1,
     828           0 :                            ulong  _n0,     ulong _n1 ) {
     829           0 :   (void)_stride; (void)_l0; (void)_l1; (void)_m0; (void)_m1; (void)_n0; (void)_n1;
     830             : 
     831           0 :   fd_tpool_t * tpool = (fd_tpool_t *)     _tpool;
     832           0 :   SORT_KEY_T * key   = (SORT_KEY_T *)     _args;
     833           0 :   SORT_IDX_T   cnt   = (SORT_IDX_T)(ulong)_reduce;
     834             : 
     835           0 :   SORT_IDX_T stack[ 4UL*8UL*sizeof(SORT_IDX_T) ];
     836           0 :   ulong      stack_cnt = 0UL;
     837             : 
     838           0 :   ulong wait_stack[ 82 ]; /* See note below for sizing considerations */
     839           0 :   ulong wait_stack_cnt = 0UL;
     840             : 
     841           0 :   stack[ stack_cnt++ ] = (SORT_IDX_T)0;
     842           0 :   stack[ stack_cnt++ ] = cnt;
     843             : 
     844           0 :   while( stack_cnt ) {
     845             : 
     846           0 :     SORT_IDX_T h = stack[ --stack_cnt ];
     847           0 :     SORT_IDX_T l = stack[ --stack_cnt ];
     848             : 
     849           0 :     SORT_IDX_T n = h-l;
     850           0 :     if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) {
     851           0 :       SORT_(insert)( key+l, n );
     852           0 :       continue;
     853           0 :     }
     854             : 
     855           0 :     SORT_KEY_T p1;
     856           0 :     SORT_KEY_T p2;
     857           0 :     do {
     858           0 :       SORT_IDX_T n3 = n / (SORT_IDX_T)3;
     859           0 :       SORT_IDX_T h1 = h - (SORT_IDX_T)1;
     860           0 :       SORT_KEY_T p0 = key[ l       ];
     861           0 :       /**/       p1 = key[ l  + n3 ];
     862           0 :       /**/       p2 = key[ h1 - n3 ];
     863           0 :       SORT_KEY_T p3 = key[ h1      ];
     864           0 : #     if SORT_QUICK_ORDER_STYLE==0
     865           0 :       SORT_KEY_T _k[2]; ulong _c;
     866           0 : #     define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
     867             : #     else
     868             :       SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
     869             : #     define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
     870             : #     endif
     871             : 
     872           0 :       ORDER(p0,p2); ORDER(p1,p3);
     873           0 :       ORDER(p0,p1); ORDER(p2,p3);
     874           0 :       ORDER(p1,p2);
     875           0 : #     undef ORDER
     876           0 :     } while(0);
     877             : 
     878           0 :   retry: /* Silly language restriction */;
     879           0 :     SORT_IDX_T i = l;
     880           0 :     SORT_IDX_T j = l;
     881           0 :     SORT_IDX_T k = h-(SORT_IDX_T)1;
     882           0 :     do {
     883           0 :       SORT_KEY_T kj = key[j];
     884             : 
     885           0 :       int to_left  = (SORT_BEFORE( kj, p1 ));
     886           0 :       int to_right = (SORT_BEFORE( p2, kj ));
     887           0 :       SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );
     888             : 
     889             : #     if SORT_QUICK_SWAP_MINIMIZE
     890             :       if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; }
     891             : #     else
     892             :       /**/                      key[j] = key[m]; key[m] = kj;
     893           0 : #     endif
     894             : 
     895           0 :       i += (SORT_IDX_T) to_left;
     896           0 :       j += (SORT_IDX_T)(to_right^1);
     897           0 :       k -= (SORT_IDX_T) to_right;
     898           0 :     } while( j<=k );
     899             : 
     900           0 :     if( FD_UNLIKELY( (j-i)==n ) ) {
     901             : 
     902           0 :       if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }
     903             : 
     904           0 :     } else {
     905             : 
     906             :       /* At this point, we have at most 3 partitions to sort and can use
     907             :          the caller and tpool threads (t0,t1) to sort them.  To load
     908             :          balance this, we sort the partitions by an estimate of how much
     909             :          work each requires.  We estimate this as proportional to:
     910             : 
     911             :            work = cnt ceil log_3 cnt if cnt>0 and 0 otherwise
     912             : 
     913             :          where cnt is the number of keys in the partition.  Since this
     914             :          is a rough estimate used for load balancing, we don't need a
     915             :          precise log_3 calculation.  With log_3 x ~ 0.633 lg x, we
     916             :          approximate:
     917             : 
     918             :            ceil log_3 x ~ ceil( 5 (lg x) / 8 )
     919             :                         ~ ceil( 5 (floor lg x) / 8)
     920             :                         = floor( (5 (floor lg x) + 7)/8 )
     921             :                         = (5 (msb x) + 7) >> 3
     922             : 
     923             :          This approximation is trivial to compute, is monotonic and is
     924             :          either exact or 1 less for x in (0,2^64).  The approximation is
     925             :          such that cnt==1 / cnt>1 yields zero / positive.  As such,
     926             :          work==0 strictly indicates no additional work is necessary. */
     927             : 
     928           0 : #     define WORK_APPROX(cnt) (((ulong)(cnt))*(ulong)((5*fd_ulong_find_msb_w_default( (ulong)(cnt), 0 )+7) >> 3))
     929             : 
     930           0 :       struct { SORT_IDX_T l; SORT_IDX_T h; ulong work; } part[3];
     931             : 
     932           0 :       part[0].l = l; part[0].h = i; part[0].work =                                     WORK_APPROX( i-l );
     933           0 :       part[1].l = i; part[1].h = j; part[1].work = fd_ulong_if( SORT_BEFORE( p1, p2 ), WORK_APPROX( j-i ), 0UL );
     934           0 :       part[2].l = j; part[2].h = h; part[2].work =                                     WORK_APPROX( h-j );
     935             : 
     936           0 : #     undef WORK_APPROX
     937             : 
     938           0 :       ulong work_remain = part[0].work + part[1].work + part[2].work;
     939             : 
     940           0 : #     define ORDER(i,j) fd_swap_if( part[j].work > part[i].work, part[i], part[j] )
     941             : 
     942           0 :       ORDER(0,2); ORDER(0,1); ORDER(1,2);
     943             : 
     944           0 : #     undef ORDER
     945             : 
     946           0 :       for( ulong idx=0UL; idx<3UL; idx++ ) {
     947             : 
     948             :         /* At this point, we need to schedule partitions [idx,2] requiring
     949             :            work_remain total estimated work and we have the caller and
     950             :            tpool threads (t0,t1) to do this work.  Compute which threads
     951             :            should execute partition idx. */
     952             : 
     953           0 :         ulong p_work = part[idx].work;
     954           0 :         if( FD_UNLIKELY( !p_work ) ) break; /* no work need for this partition and remainin partitions */
     955             : 
     956           0 :         SORT_IDX_T p_l = part[idx].l;
     957           0 :         SORT_IDX_T p_h = part[idx].h;
     958             : 
     959           0 :         ulong t_cnt = t1 - t0;
     960           0 :         if( FD_UNLIKELY( (t_cnt>1UL) & (p_work<work_remain) ) ) {
     961             : 
     962             :           /* At this point, we have at least two partitions that need
     963             :              work to sort and at least two threads remaining for
     964             :              sorting.  Schedule this partition for execution on between
     965             :              1 and t_cnt-1 of the remaining threads roughly proportional
     966             :              to the estimated work to sort this partition versus the
     967             :              total estimated work remaining.
     968             : 
     969             :              For the wait stack, in the worst case, we schedule two
     970             :              partitions for remote execution covering two thirds of the
     971             :              keys each round.  This implies the wait stack depth is
     972             :              bounded by:
     973             : 
     974             :                2 ceil log_3 thread_cnt
     975             : 
     976             :              Practically, thread_cnt<=FD_TILE_MAX.  For
     977             :              FD_TILE_MAX==1024, 14 is sufficient and 82 is sufficient
     978             :              for any thread_cnt representable by a ulong. */
     979             : 
     980           0 :           ulong ts = t1 - fd_ulong_min( fd_ulong_max( (t_cnt*p_work + (work_remain>>1)) / work_remain, 1UL ), t_cnt-1UL );
     981             : 
     982           0 :           fd_tpool_exec( tpool, ts, SORT_(private_quick_node), tpool, ts, t1, key + p_l, (void *)(ulong)(p_h - p_l),
     983           0 :                          0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL );
     984             : 
     985           0 :           wait_stack[ wait_stack_cnt++ ] = ts;
     986             : 
     987             :           /* Advance t1 and work_remain for the next iteration */
     988             : 
     989           0 :           t1           = ts;
     990           0 :           work_remain -= p_work;
     991             : 
     992           0 :         } else {
     993             : 
     994             :           /* At this point, we have only one thread available and/or
     995             :              this is the only partition remaining that needs sorting.
     996             :              Schedule this partition for local execution. */
     997             : 
     998           0 :           stack[ stack_cnt++ ] = p_l;
     999           0 :           stack[ stack_cnt++ ] = p_h;
    1000             : 
    1001           0 :         }
    1002           0 :       }
    1003           0 :     }
    1004           0 :   }
    1005             : 
    1006             :   /* Wait for any children to finish */
    1007             : 
    1008           0 :   while( wait_stack_cnt ) fd_tpool_wait( tpool, wait_stack[ --wait_stack_cnt ] );
    1009           0 : }
    1010             : 
    1011           0 : FD_MAP_REDUCE_BEGIN( SORT_(private_merge_para), 1L, 0UL, sizeof(SORT_KEY_T *), 1L ) {
    1012             : 
    1013           0 :   SORT_KEY_T ** out = (SORT_KEY_T **)arg[0];
    1014           0 :   SORT_KEY_T *  key = (SORT_KEY_T * )arg[1];
    1015           0 :   SORT_KEY_T *  tmp = (SORT_KEY_T * )arg[2];
    1016             : 
    1017           0 :   *out = SORT_(stable_fast)( key + block_i0, (SORT_IDX_T)block_cnt, tmp + block_i0 );
    1018             : 
    1019           0 : } FD_MAP_END {
    1020             : 
    1021           0 :   SORT_KEY_T *  key   = (SORT_KEY_T * )arg[1];
    1022           0 :   SORT_KEY_T *  tmp   = (SORT_KEY_T * )arg[2];
    1023           0 :   SORT_KEY_T ** _in_l = (SORT_KEY_T **)arg[0]; SORT_KEY_T * in_l = *_in_l; long cnt_l = block_is - block_i0;
    1024           0 :   SORT_KEY_T ** _in_r = (SORT_KEY_T **)_r1;    SORT_KEY_T * in_r = *_in_r; long cnt_r = block_i1 - block_is;
    1025             : 
    1026             :   /* Merge in_l / in_r (see private_merge above for details) */
    1027             : 
    1028           0 :   SORT_KEY_T * out = ((in_l==(key+block_i0)) ? tmp : key) + block_i0; /* cmov */
    1029             : 
    1030           0 :   SORT_(private_merge_pass)( in_l, cnt_l, in_r, cnt_r, out );
    1031             : 
    1032           0 :   *_in_l = out;
    1033             : 
    1034           0 : } FD_REDUCE_END
    1035             : 
    1036           0 : FD_FOR_ALL_BEGIN( SORT_(private_memcpy_para), 1L ) {
    1037             : 
    1038           0 :   SORT_KEY_T       * dst = (SORT_KEY_T       *)arg[0];
    1039           0 :   SORT_KEY_T const * src = (SORT_KEY_T const *)arg[1];
    1040             : 
    1041           0 :   if( FD_LIKELY( block_cnt ) ) memcpy( dst + block_i0, src + block_i0, sizeof(SORT_KEY_T)*(ulong)block_cnt );
    1042             : 
    1043           0 : } FD_FOR_ALL_END
    1044             : 
    1045             : #if FD_HAS_ALLOCA
    1046             : 
    1047           0 : static FD_FOR_ALL_BEGIN( SORT_(private_cntcpy_para), 1L ) {
    1048           0 :   ulong              tpool_base = (ulong             )arg[0];
    1049           0 :   ulong              t_cnt      = (ulong             )arg[1];
    1050           0 :   ulong            * _key_cnt   = (ulong            *)arg[2];
    1051           0 :   SORT_KEY_T const * key        = (SORT_KEY_T const *)arg[3];
    1052           0 :   SORT_KEY_T       * tmp        = (SORT_KEY_T       *)arg[4];
    1053           0 :   SORT_KEY_T const * pivot      = (SORT_KEY_T const *)arg[5];
    1054             : 
    1055             :   /* Note keys in [ pivot[t-1], pivot[t] ) for t in [0,t_cnt) are keys
    1056             :      assigned to thread t for sorting.  pivot[-1] / pivot[t_cnt-1] are
    1057             :      an implied -infinity / +infinity and never explicitly accessed.  As
    1058             :      such, pivot is indexed [0,t_cnt-1). */
    1059             : 
    1060             :   /* Allocate and clear a local scratch for counting.  We don't count
    1061             :      directly into key_cnt to avoid false sharing while counting. */
    1062             : 
    1063           0 :   ulong * key_cnt = fd_alloca( alignof(ulong), sizeof(ulong)*t_cnt );
    1064             : 
    1065           0 :   memset( key_cnt, 0, sizeof(ulong)*t_cnt );
    1066             : 
    1067           0 :   for( long i=block_i0; i<block_i1; i++ ) {
    1068           0 :     SORT_KEY_T ki = key[i];
    1069             : 
    1070             :     /* Determine which thread is responsible for subsorting this key */
    1071             :     /* FIXME: ideally, use a unrolled fixed count iteration for here */
    1072             : 
    1073           0 :     ulong l = 0UL;
    1074           0 :     ulong h = t_cnt;
    1075           0 :     for(;;) {
    1076           0 :       ulong n = h - l;
    1077           0 :       if( n<2UL ) break;
    1078             : 
    1079             :       /* At this point, the thread for key i is in [l,h) and this range
    1080             :          contains at least two threads.  Split this range in half. */
    1081             : 
    1082           0 :       ulong m = l + (n>>1); /* In [1,t_cnt) */
    1083           0 :       int   c = SORT_BEFORE( ki, pivot[m-1UL] );
    1084             : 
    1085             :       /* If ki is before pivot[m], the target thread is in [l,m).
    1086             :          Otherwise, the target thread is in [m,h). */
    1087             : 
    1088           0 :       l = fd_ulong_if( c, l, m );
    1089           0 :       h = fd_ulong_if( c, m, h );
    1090           0 :     }
    1091             : 
    1092             :     /* At this point, key[i] should be assigned to thread l.  Count it
    1093             :        and copy it into tmp to prepare to scatter. */
    1094             : 
    1095           0 :     key_cnt[l]++;
    1096           0 :     tmp[i] = ki;
    1097           0 :   }
    1098             : 
    1099             :   /* Send the counts back to main thread for partitioning */
    1100             : 
    1101           0 :   memcpy( _key_cnt + (tpool_t0-tpool_base)*t_cnt, key_cnt, sizeof(ulong)*t_cnt );
    1102             : 
    1103           0 : } FD_FOR_ALL_END
    1104             : 
    1105           0 : static FD_FOR_ALL_BEGIN( SORT_(private_scatter_para), 1L ) {
    1106           0 :   ulong              tpool_base = (ulong             )arg[0];
    1107           0 :   ulong              t_cnt      = (ulong             )arg[1];
    1108           0 :   ulong      const * _part      = (ulong      const *)arg[2];
    1109           0 :   SORT_KEY_T       * key        = (SORT_KEY_T       *)arg[3];
    1110           0 :   SORT_KEY_T const * tmp        = (SORT_KEY_T const *)arg[4];
    1111           0 :   SORT_KEY_T const * pivot      = (SORT_KEY_T const *)arg[5];
    1112             : 
    1113             :   /* Receive the key array partitioning from the main thread.  Like the
    1114             :      above, we don't operate on the part array directly to avoid false
    1115             :      sharing while scattering. */
    1116             : 
    1117           0 :   ulong * part = fd_alloca( alignof(ulong), sizeof(ulong)*t_cnt );
    1118             : 
    1119           0 :   memcpy( part, _part + (tpool_t0-tpool_base)*t_cnt, sizeof(ulong)*t_cnt );
    1120             : 
    1121           0 :   for( long i=block_i0; i<block_i1; i++ ) {
    1122           0 :     SORT_KEY_T ki = tmp[i];
    1123             : 
    1124             :     /* Determine which thread is responsible for subsort this key
    1125             :        (again).  Identical considerations to the above.  Note: we can
    1126             :        eliminate this computation if willing to burn O(N) additional
    1127             :        scratch memory by saving results computed above for use here. */
    1128             : 
    1129           0 :     ulong l = 0UL;
    1130           0 :     ulong h = t_cnt;
    1131           0 :     for(;;) {
    1132           0 :       ulong n = h - l;
    1133           0 :       if( n<2UL ) break;
    1134           0 :       ulong m = l + (n>>1);
    1135           0 :       int   c = SORT_BEFORE( ki, pivot[m-1UL] );
    1136           0 :       l = fd_ulong_if( c, l, m );
    1137           0 :       h = fd_ulong_if( c, m, h );
    1138           0 :     }
    1139             : 
    1140             :     /* Send this key to target thread */
    1141             : 
    1142           0 :     key[ part[l]++ ] = ki;
    1143           0 :   }
    1144             : 
    1145           0 : } FD_FOR_ALL_END
    1146             : 
    1147           0 : static FD_FOR_ALL_BEGIN( SORT_(private_subsort_para), 1L ) {
    1148           0 :   ulong const * part   = (ulong const *)arg[0];
    1149           0 :   SORT_KEY_T  * key    = (SORT_KEY_T  *)arg[1];
    1150           0 :   SORT_KEY_T  * tmp    = (SORT_KEY_T  *)arg[2];
    1151           0 :   int           stable = (int          )arg[3];
    1152             : 
    1153           0 :   ulong j0 = part[ block_i0 ];
    1154           0 :   ulong j1 = part[ block_i1 ];
    1155             : 
    1156           0 :   if( stable ) SORT_(stable) ( key + j0, j1 - j0, tmp + j0 );
    1157           0 :   else         SORT_(inplace)( key + j0, j1 - j0 );
    1158             : 
    1159           0 : } FD_FOR_ALL_END
    1160             : 
    1161             : SORT_FN_ATTR SORT_KEY_T *
    1162             : SORT_(fast_para)( fd_tpool_t * tpool, ulong t0, ulong t1,
    1163             :                   SORT_KEY_T * key,
    1164             :                   SORT_IDX_T   cnt,
    1165             :                   void *       scratch,
    1166             :                   ulong        seed,
    1167           0 :                   int          stable ) {
    1168             : 
    1169           0 :   if( FD_UNLIKELY( cnt<(SORT_IDX_T)2 ) ) return key; /* nothing to do */
    1170             : 
    1171             :   /* For the below (if sampling ops are fully threaded), sampling costs
    1172             :      O(S), the sample sort costs O(S lg(T S)), the downsampling costs
    1173             :      O(1), the partitioning costs O(T), the counting and scattering cost
    1174             :      O(N (lg T) / T), the subsorts cost (N/T) lg(N/T) and thread
    1175             :      synchronization overhead is O(lg T).
    1176             : 
    1177             :      Combining all these, assuming the sampling ratio S is a fixed O(1)
    1178             :      quantity and assuming N (lg T) / T term in the counting/scattering
    1179             :      approximately cancals the -N lg T / T in the subsorts yields a
    1180             :      wallclock that scales as:
    1181             : 
    1182             :        alpha N (ln N) / T + beta ln T + gamma T
    1183             : 
    1184             :      The first term represents the parallelized sorting cost, the second
    1185             :      term represents the thread dispatch / sync cost, and the last term
    1186             :      represents the partitioning costs.  Minimizing cost with respect
    1187             :      to T yields:
    1188             : 
    1189             :        -alpha (N ln N) / T_opt^2 + beta / T_opt + gamma = 0
    1190             : 
    1191             :      whose (positive) solution is:
    1192             : 
    1193             :        T_opt ~ (0.5 beta / gamma) [ sqrt( 1 + (4 alpha gamma N ln N)/beta^2) ) - 1 ]
    1194             : 
    1195             :      In the limit 4 alpha gamma N ln N / beta^2 >> 1 (that is,
    1196             :      partitioning costs are much lower than thread start/stop costs), we
    1197             :      have:
    1198             : 
    1199             :        T_opt -> (alpha N ln N) / beta
    1200             : 
    1201             :      In the other limit, we have:
    1202             : 
    1203             :        T_opt -> sqrt( alpha N ln N / gamma )
    1204             : 
    1205             :      The first limit tends to apply in practice.  Thus we use:
    1206             : 
    1207             :        T_opt ~ (N lg N) / thresh
    1208             :              ~ floor( (N msb N + N/2 + thresh/2) / thresh)
    1209             : 
    1210             :      where thresh is an empirical minimum amount of sorting work to
    1211             :      justify starting / stopping a thread.  (As written below, the gamma
    1212             :      term is more like gamma T^2, which makes the other limit more like
    1213             :      the cube root of N ln N but doesn't change the overall
    1214             :      conclusion.) */
    1215             : 
    1216           0 :   ulong thresh = (4096UL + sizeof(SORT_KEY_T)-1UL) / sizeof(SORT_KEY_T);
    1217           0 :   ulong t_cnt  = fd_ulong_min( t1 - t0, (cnt*(ulong)fd_ulong_find_msb( cnt ) + ((cnt+thresh)>>1)) / thresh );
    1218           0 :   if( FD_UNLIKELY( t_cnt<2UL ) ) return stable ? SORT_(stable)( key, cnt, scratch ) : SORT_(inplace)( key, cnt );
    1219           0 :   t1 = t0 + t_cnt;
    1220             : 
    1221             :   /* At this point, we have at least 2 threads available and at least 2
    1222             :      items to sort.  Sample the keys to get some idea of their
    1223             :      distribution, sort the samples and  downsample the sorted samples
    1224             :      into pivots that approximately uniformly partition the samples into
    1225             :      t_cnt groups (and thus approximately partition the keys uniformly
    1226             :      too).  Notes:
    1227             : 
    1228             :      - The sampling below is practically uniform IID but it could be
    1229             :        made more robust with a rejection method (ala fd_rng_ulong_roll).
    1230             : 
    1231             :      - Increasing SORT_OVERSAMPLE_RATIO improves the uniformity of the
    1232             :        partitioning of key space over the inputs but requires more stack
    1233             :        scratch memory and more overhead.
    1234             : 
    1235             :      - All three steps here could be parallelized but it is probably not
    1236             :        worth it given the overhead for threaded versus the number of
    1237             :        samples.
    1238             : 
    1239             :      - For a stable sort, the result is completely deterministic even
    1240             :        if the seed provided is non-deterministic. */
    1241             : 
    1242           0 :   ulong        sample_cnt = t_cnt*(ulong)SORT_OVERSAMPLE_RATIO;
    1243           0 :   SORT_KEY_T * pivot      = fd_alloca( alignof(ulong), sizeof(ulong)*sample_cnt );
    1244             : 
    1245           0 :   for( ulong i=0UL; i<sample_cnt; i++ ) pivot[i] = key[ fd_ulong_hash( seed ^ i ) % cnt ];
    1246             : 
    1247           0 :   SORT_(inplace)( pivot, sample_cnt );
    1248             : 
    1249           0 :   for( ulong i=1UL; i<t_cnt; i++ ) pivot[i-1UL] = pivot[ i*(ulong)SORT_OVERSAMPLE_RATIO ];
    1250             : 
    1251             :   /* At this point, keys in [ pivot[t-1], pivot[t] ) should be assigned
    1252             :      to thread t for thread t's subsort.  pivot[-1] / pivot[t_cnt-1] are
    1253             :      an implicit -infinity / +infinity.  Split the input array equally
    1254             :      over threads and have each thread copy their block of keys into tmp
    1255             :      and count the number of keys in its block that could be assigned to
    1256             :      each thread. */
    1257             : 
    1258           0 : # define part(i,j) part[ (i) + t_cnt*(j) ]
    1259             : 
    1260           0 :   ulong * part = fd_alloca( alignof(ulong), sizeof(ulong)*t_cnt*t_cnt );
    1261             : 
    1262           0 :   SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
    1263             : 
    1264           0 :   FD_FOR_ALL( SORT_(private_cntcpy_para), tpool,t0,t1, 0L,(long)cnt, t0, t_cnt, part, key, tmp, pivot );
    1265             : 
    1266             :   /* At this point, part(i,j) is the count of the number of keys in thread
    1267             :      j's block that were assigned to thread i's subsort.  Convert this
    1268             :      into a partitioning.  In-principle this can be parallelized but
    1269             :      this is rarely worth it practically. */
    1270             : 
    1271           0 :   ulong k = 0UL;
    1272           0 :   for( ulong i=0UL; i<t_cnt; i++ ) {
    1273           0 :     for( ulong j=0UL; j<t_cnt; j++ ) {
    1274           0 :       ulong c_ij = part(i,j);
    1275           0 :       part(i,j) = k;
    1276           0 :       k += c_ij;
    1277           0 :     }
    1278           0 :   }
    1279             : 
    1280             :   /* At this point, the range [ part(i,j), part(i+1,j) ) is where keys
    1281             :      in thread j's block assigned to thread i's subsort should be
    1282             :      scattered.  part(t_cnt,t_cnt-1) is an implied cnt.  Scatter the
    1283             :      keys from tmp back into key in subsorted order. */
    1284             : 
    1285           0 :   FD_FOR_ALL( SORT_(private_scatter_para), tpool,t0,t1, 0L,(long)cnt, t0, t_cnt, part, key, tmp, pivot );
    1286             : 
    1287             :   /* At this point, keys [ part(i,0), part(i+1,0) ) are the keys assigned
    1288             :      to thread i for subsorting.  part(t_cnt,0) is an implied cnt.
    1289             :      Since t_cnt>1 though this location exists.  We make that explicit
    1290             :      such that part[i],part[i+1] give the sets of keys each thread
    1291             :      should sort.  Do the thread parallel subsorts to get the keys into
    1292             :      final sorted order. */
    1293             : 
    1294           0 :   part[ t_cnt ] = cnt;
    1295             : 
    1296           0 :   FD_FOR_ALL( SORT_(private_subsort_para), tpool,t0,t1, 0L,(long)t_cnt, part, key, tmp, stable );
    1297             : 
    1298           0 : # undef part
    1299             : 
    1300           0 :   return key;
    1301           0 : }
    1302             : 
    1303             : #endif /* FD_HAS_ALLOCA */
    1304             : #endif /* SORT_PARALLEL */
    1305             : #endif /* SORT_IMPL_STYLE!=1 */
    1306             : 
    1307             : #undef SORT_
    1308             : #undef SORT_STATIC
    1309             : 
    1310             : #undef SORT_IMPL_STYLE
    1311             : #undef SORT_FN_ATTR
    1312             : #undef SORT_IDX_IF
    1313             : #undef SORT_OVERSAMPLE_RATIO
    1314             : #undef SORT_PARALLEL
    1315             : #undef SORT_QUICK_SWAP_MINIMIZE
    1316             : #undef SORT_QUICK_ORDER_STYLE
    1317             : #undef SORT_QUICK_THRESH
    1318             : #undef SORT_MERGE_THRESH
    1319             : #undef SORT_BEFORE
    1320             : #undef SORT_IDX_T
    1321             : #undef SORT_KEY_T
    1322             : #undef SORT_NAME

Generated by: LCOV version 1.14