LCOV - code coverage report
Current view: top level - util/tmpl - fd_sort.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 190 199 95.5 %
Date: 2024-11-13 11:58:15 Functions: 51 120132 0.1 %

          Line data    Source code
       1             : /* Declares a family of functions useful for single threaded sorting of
       2             :    POD types in high performance contexts.  Example usage:
       3             : 
       4             :      #define SORT_NAME        sort_double_descend
       5             :      #define SORT_KEY_T       double
       6             :      #define SORT_BEFORE(a,b) (a)>(b)
       7             :      #include "util/tmpl/fd_sort.c"
       8             : 
       9             :    will create the following API for use in the local compile unit:
      10             : 
      11             :      // Sort key[i] for i in [0,cnt) stable in place in a best / average
      12             :      // worst case of O(N) / O(N^2) / O(N^2) operations.
      13             : 
      14             :      double *
      15             :      sort_double_descend_insert( double * key,
      16             :                                  ulong    cnt );
      17             : 
      18             :      // Return 1 if cnt is a sensible value for the stable sorting APIs
      19             :      // (i.e. cnt*sizeof(double)+alignof(double)-1 will not overflow).
      20             : 
      21             :      int sort_double_descend_stable_cnt_valid( ulong cnt );
      22             : 
      23             :      // Return the alignment and footprint required for a scratch region
      24             :      // adequate for sorting up to cnt elements.  The results will be
      25             :      // standard allocator and standard declaration friendly.  (E.g. a
      26             :      // declaration "double scratch[ cnt ];" will be fine as a scratch
      27             :      // region.)
      28             : 
      29             :      ulong sort_double_descend_stable_scratch_align    ( void      );
      30             :      ulong sort_double_descend_stable_scratch_footprint( ulong cnt );
      31             : 
      32             :      // Sort key[i] for i in [0,cnt) stable in best / average / worst
      33             :      // case of O(N lg N) / O(N lg N) / O(N lg N) operations.  Scratch
      34             :      // is a scratch workspace of suitable alignment and footprint.
      35             :      // Returns where the sorted values ended up.  Will be at either key
      36             :      // or (double *)scratch.
      37             : 
      38             :      double *
      39             :      sort_double_descend_stable_fast( double * key,
      40             :                                       ulong    cnt,
      41             :                                       void   * scratch );
      42             : 
      43             :      // Same as above but does additional copying if necessary such that
      44             :      // the final result ends up in key.  Returns key.
      45             : 
      46             :      double *
      47             :      sort_double_descend_stable( double * key,
      48             :                                  ulong    cnt,
      49             :                                  void *   scratch );
      50             : 
      51             :      // Sort key[i] for i in [0,cnt) inplace in best / average / worst
      52             :      // case of O(N lg N) / O(N lg N) / O(N^2) operations.  Returns key.
      53             : 
      54             :      double *
      55             :      sort_double_inplace( double * key,
      56             :                           ulong    cnt );
      57             : 
      58             :      // Partially sort key[i] such that key[rnk] holds rnk and return
      59             :      // key[rnk] in best / average / worst of O(N) / O(N) / O(N^2)
      60             :      // operations.  Returns key.  Assumes key is non-NULL, cnt is valid
      61             :      // and positive and rnk is in [0,cnt)
      62             : 
      63             :      double *
      64             :      sort_double_select( double * key,
      65             :                          ulong    cnt,
      66             :                          ulong    rnk );
      67             : 
      68             :      // Binary search for element equal-or-greater than key in a sorted
      69             :      // list.  Return value is an index into sorted in [0,cnt].  If
      70             :      // return value > 0 then it is the largest index where
      71             :      // SORT_BEFORE(sorted[index-1],query)==1.   Returns 0 if
      72             :      // SORT_BEFORE(query,elem)==1 for all elements in sorted.  Assumes
      73             :      // SORT_BEFORE defines a total order over sorted, and each element
      74             :      // in sorted is less-or-equal than its successor.  (Returns
      75             :      // incorrect result otherwise!)   Robust if cnt==0 (in that case,
      76             :      // returns 0 and ignores pointer to sorted).
      77             : 
      78             :      ulong
      79             :      sort_double_search_geq( double const * sorted,
      80             :                              ulong          cnt,
      81             :                              double         query );
      82             : 
      83             :    It is fine to include this template multiple times in a compilation
      84             :    unit.  Just provide the specification before each inclusion.  Various
      85             :    additional options to tune the methods are described below. */
      86             : 
      87             : #include "../bits/fd_bits.h"
      88             : 
      89             : /* SORT_NAME gives the name of the function to declare (and the base
      90             :    name of auxiliary and/or variant functions). */
      91             : 
      92             : #ifndef SORT_NAME
      93             : #error "SORT_NAME must be defined"
      94             : #endif
      95             : 
      96             : /* SORT_KEY_T gives the POD datatype to sort. */
      97             : 
      98             : #ifndef SORT_KEY_T
      99             : #error "SORT_KEY_T must be defined"
     100             : #endif
     101             : 
     102             : /* SORT_IDX_T gives the data type used to index the arrays */
     103             : 
     104             : #ifndef SORT_IDX_T
     105  1680089144 : #define SORT_IDX_T ulong
     106             : #endif
     107             : 
     108             : /* SORT_BEFORE(a,b) evaluates to 1 if a<b is strictly true.  SAFETY TIP:
     109             :    This is not a 3-way comparison function! */
     110             : 
     111             : #ifndef SORT_BEFORE
     112     9148622 : #define SORT_BEFORE(a,b) (a)<(b)
     113             : #endif
     114             : 
     115             : /* SORT_MERGE_THRESH / SORT_QUICK_THRESH give largest merge/quick
     116             :    partition size where insertion sort will be used.  Should be at least
     117             :    2 (and probably larger than one might expect to get optimal
     118             :    performance). */
     119             : 
     120             : #ifndef SORT_MERGE_THRESH
     121     6340896 : #define SORT_MERGE_THRESH 32
     122             : #endif
     123             : 
     124             : #ifndef SORT_QUICK_THRESH
     125             : #define SORT_QUICK_THRESH 32
     126             : #endif
     127             : 
     128             : /* SORT_QUICK_ORDER_STYLE selects the method used for ordering two keys.
     129             :    Roughly, say 1 for floating point keys (see quick method for
     130             :    details). */
     131             : 
     132             : #ifndef SORT_QUICK_ORDER_STYLE
     133             : #define SORT_QUICK_ORDER_STYLE 0
     134             : #endif
     135             : 
     136             : /* SORT_QUICK_SWAP_MINIMIZE indicates that quick sort should eat the
     137             :    non-deterministic branch cost to avoid no-op swaps.  This is useful
     138             :    if the key_t has a large memory footprint such that the no-op is
     139             :    actually a larger cost than a branch mispredict. */
     140             : 
     141             : #ifndef SORT_QUICK_SWAP_MINIMIZE
     142             : #define SORT_QUICK_SWAP_MINIMIZE 0
     143             : #endif
     144             : 
     145             : /* 0 - local use only
     146             :    1 - library header declaration
     147             :    2 - library implementation */
     148             : 
     149             : #ifndef SORT_IMPL_STYLE
     150             : #define SORT_IMPL_STYLE 0
     151             : #endif
     152             : 
     153             : /* SORT_IDX_IF(c,t,f) returns sort_idx t if c is non-zero and sort_idx f o.w. */
     154             : 
     155             : #ifndef SORT_IDX_IF
     156   576125463 : #define SORT_IDX_IF(c,t,f) ((SORT_IDX_T)fd_ulong_if( (c), (ulong)(t), (ulong)(f) ))
     157             : #endif
     158             : 
     159             : /**********************************************************************/
     160             : 
     161    13857882 : #define SORT_(x)FD_EXPAND_THEN_CONCAT3(SORT_NAME,_,x)
     162             : 
     163             : #if SORT_IMPL_STYLE==1 /* need prototypes */
     164             : 
     165             : SORT_KEY_T *
     166             : SORT_(private_merge)( SORT_KEY_T * key,
     167             :                       SORT_IDX_T   cnt,
     168             :                       SORT_KEY_T * tmp );
     169             : 
     170             : SORT_KEY_T *
     171             : SORT_(private_quick)( SORT_KEY_T * key,
     172             :                       SORT_IDX_T   cnt );
     173             : 
     174             : SORT_KEY_T *
     175             : SORT_(private_select)( SORT_KEY_T * key,
     176             :                        SORT_IDX_T   cnt,
     177             :                        SORT_IDX_T   rnk );
     178             : 
     179             : #else /* need implementations */
     180             : 
     181             : #if SORT_IMPL_STYLE==0 /* local only */
     182             : #define SORT_IMPL_STATIC static
     183             : #else
     184             : #define SORT_IMPL_STATIC
     185             : #endif
     186             : 
     187             : static SORT_KEY_T *
     188             : SORT_(insert)( SORT_KEY_T * key,
     189     5240919 :                SORT_IDX_T   cnt ) {
     190   118914903 :   for( SORT_IDX_T i=((SORT_IDX_T)1); i<cnt; i++ ) {
     191   113673984 :     SORT_KEY_T key_i = key[i];
     192   113673984 :     SORT_IDX_T j = i;
     193   937163512 :     while( j ) {
     194   933279104 :       SORT_IDX_T k = j - ((SORT_IDX_T)1);
     195   933279104 :       SORT_KEY_T key_j = key[k]; if( !(SORT_BEFORE( key_i, key_j )) ) break;
     196   823489528 :       key[j] = key_j;
     197   823489528 :       j = k;
     198   823489528 :     }
     199   113673984 :     key[j] = key_i;
     200   113673984 :   }
     201     5240919 :   return key;
     202     5240919 : }
     203             : 
     204             : /* FIXME: USE IMPLICIT RECURSION ALA QUICK BELOW? */
     205             : 
     206             : SORT_IMPL_STATIC SORT_KEY_T *
     207             : SORT_(private_merge)( SORT_KEY_T * key,
     208             :                       SORT_IDX_T   cnt,
     209     6340896 :                       SORT_KEY_T * tmp ) {
     210             : 
     211             :   /* If below threshold, use insertion sort */
     212             : 
     213     6340896 :   if( cnt<=((SORT_IDX_T)(SORT_MERGE_THRESH)) ) return SORT_(insert)( key, cnt );
     214             : 
     215             :   /* Otherwise, break input in half and sort the halves */
     216             : 
     217     2659872 :   SORT_KEY_T * key_left  = key;
     218     2659872 :   SORT_KEY_T * tmp_left  = tmp;
     219     2659872 :   SORT_IDX_T   cnt_left  = cnt >> 1;
     220     2659872 :   SORT_KEY_T * out_left  = SORT_(private_merge)( key_left,  cnt_left,  tmp_left  );
     221             : 
     222     2659872 :   SORT_KEY_T * key_right = key + cnt_left;
     223     2659872 :   SORT_KEY_T * tmp_right = tmp + cnt_left;
     224     2659872 :   SORT_IDX_T   cnt_right = cnt - cnt_left;
     225     2659872 :   SORT_KEY_T * out_right = SORT_(private_merge)( key_right, cnt_right, tmp_right );
     226             : 
     227             :   /* Merge out_left / out_right */
     228             : 
     229     2659872 :   if( out_left==key ) key = tmp; /* If out_left overlaps with key, merge into tmp.  Otherwise, merge into key */
     230             : 
     231             :   /* Note that out_left does not overlap with the location for merge
     232             :      output at this point.  out_right might overlap (with the right
     233             :      half) with the location for merge output but this will still work
     234             :      in that case.  */
     235             : 
     236     2659872 :   SORT_IDX_T i = ((SORT_IDX_T)0);
     237     2659872 :   SORT_IDX_T j = ((SORT_IDX_T)0);
     238     2659872 :   SORT_IDX_T k = ((SORT_IDX_T)0);
     239     2659872 : # if 1 /* Minimal C language operations */
     240   144635166 :   for(;;) { /* Note that cnt_left>0 and cnt_right>0 as cnt > SORT_MERGE_THRESH >= 1 at this point */
     241   144635166 :     if( SORT_BEFORE( out_right[k], out_left[j] ) ) {
     242    52344009 :       key[i++] = out_right[k++];
     243    52344009 :       if( k>=cnt_right ) {
     244      401367 :         do key[i++] = out_left [j++]; while( j<cnt_left  ); /* append left  stragglers (at least one) */
     245      170760 :         break;
     246      170760 :       }
     247    92291157 :     } else {
     248    92291157 :       key[i++] = out_left [j++];
     249    92291157 :       if( j>=cnt_left  ) {
     250    41702559 :         do key[i++] = out_right[k++]; while( k<cnt_right ); /* append right stragglers (at least one) */
     251     2489112 :         break;
     252     2489112 :       }
     253    92291157 :     }
     254   144635166 :   }
     255             : # else /* Branch free variant (Pyth investigations suggested the above is better) */
     256             :   for(;;) { /* Note that cnt_left>0 and cnt_right>0 as cnt > SORT_MERGE_THRESH >= 1 at this point */
     257             :     SORT_KEY_T out_j = out_left [j];
     258             :     SORT_KEY_T out_k = out_right[k];
     259             :     int from_right = (SORT_BEFORE( out_k, out_j ));
     260             :     key[i++] = from_right ? out_k : out_j; FD_COMPILER_FORGET( from_right );
     261             :     j += (SORT_IDX_T)(from_right^1);       FD_COMPILER_FORGET( from_right );
     262             :     k += (SORT_IDX_T) from_right;
     263             :     int c = (j<cnt_left) & (k<cnt_right); FD_COMPILER_FORGET( c ); /* prevent compiler from branch nesting the compares */
     264             :     if( !c ) break;
     265             :   }
     266             :   for( ; j<cnt_left;  j++ ) key[i++] = out_left [j];
     267             :   for( ; k<cnt_right; k++ ) key[i++] = out_right[k];
     268             : # endif
     269             : 
     270     2659872 :   return key;
     271     6340896 : }
     272             : 
     273             : /* This uses a dual pivot quick sort for better theoretical and
     274             :    practical mojo. */
     275             : 
     276             : SORT_IMPL_STATIC SORT_KEY_T *
     277             : SORT_(private_quick)( SORT_KEY_T * key,
     278     2084559 :                       SORT_IDX_T   cnt ) {
     279     2084559 :   SORT_IDX_T stack[ 4UL*8UL*sizeof(SORT_IDX_T) ]; /* See note below on sizing */
     280     2084559 :   ulong      stack_cnt = 0UL;
     281             : 
     282     2084559 :   stack[ stack_cnt++ ] = (SORT_IDX_T)0;
     283     2084559 :   stack[ stack_cnt++ ] = cnt;
     284     6633585 :   for(;;) {
     285             : 
     286             :     /* Pop the next partition to process */
     287             : 
     288     6633585 :     if( !stack_cnt ) return key; /* All done */
     289     4549026 :     SORT_IDX_T h = stack[ --stack_cnt ];
     290     4549026 :     SORT_IDX_T l = stack[ --stack_cnt ];
     291             : 
     292             :     /* [l,h) is the partition we are sorting.  If this partition is
     293             :        small enough, sort it via insertion sort. */
     294             : 
     295     4549026 :     SORT_IDX_T n = h-l;
     296     4549026 :     if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) {
     297     1288443 :       SORT_(insert)( key+l, n );
     298     1288443 :       continue;
     299     1288443 :     }
     300             : 
     301             :     /* This partition is too large to insertion sort.  Pick two pivots,
     302             :        sort the partition into a left, center and right partition and
     303             :        then recursively sort those partitions.  We initially use a
     304             :        simple choice of pivots that is ideal for nearly sorted data (in
     305             :        either direction) with a little extra sampling to improve the
     306             :        partitioning for randomly ordered data.  There is a possibility
     307             :        that this deterministic choice of pivots will not succeed in
     308             :        making two or more non-empty partitions; we detect and correct
     309             :        that below. */
     310             : 
     311             :     /* Initial pivot selection */
     312             : 
     313     3260583 :     SORT_KEY_T p1;
     314     3260583 :     SORT_KEY_T p2;
     315     3260583 :     do {
     316     3260583 :       SORT_IDX_T n3 = n / (SORT_IDX_T)3; /* magic multiply under the hood typically */
     317     3260583 :       SORT_IDX_T h1 = h - (SORT_IDX_T)1;
     318     3260583 :       SORT_KEY_T p0 = key[ l       ];
     319     3260583 :       /**/       p1 = key[ l  + n3 ];
     320     3260583 :       /**/       p2 = key[ h1 - n3 ];
     321     3260583 :       SORT_KEY_T p3 = key[ h1      ];
     322     3260583 : #     if SORT_QUICK_ORDER_STYLE==0
     323             :       /* Generates better code for integral types (branchless via stack) */
     324     3260583 :       SORT_KEY_T _k[2]; ulong _c;
     325    16302915 : #     define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
     326             : #     else
     327             :       /* Generates much better code for floating point types (branchless
     328             :          via min / max floating point instructions) */
     329             :       SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
     330             : #     define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
     331             : #     endif
     332             : 
     333     3260583 :       ORDER(p0,p2); ORDER(p1,p3);
     334     3260583 :       ORDER(p0,p1); ORDER(p2,p3);
     335     3260583 :       ORDER(p1,p2);
     336     3260583 : #     undef ORDER
     337     3260583 :     } while(0);
     338             : 
     339     3648915 :   retry: /* Three way partitioning (silly language restriction) */;
     340     3648915 :     SORT_IDX_T i = l;
     341     3648915 :     SORT_IDX_T j = l;
     342     3648915 :     SORT_IDX_T k = h-(SORT_IDX_T)1;
     343   509947599 :     do { /* Note [j,k] is non-empty (look ma ... no branches!) */
     344             : 
     345             :       /* At this point, p1 <= p2 and:
     346             :          - keys [l,i) are before p1 (left partition)
     347             :          - keys [i,j) are in [p1,p2] (center partition)
     348             :          - keys [j,k] are not yet partitioned (region is non-empty)
     349             :          - keys (k,h) are after p2 (right partition)
     350             :          Decide where key j should be moved. */
     351             : 
     352   509947599 :       SORT_KEY_T kj = key[j];
     353             : 
     354   509947599 :       int to_left  = (SORT_BEFORE( kj, p1 ));
     355   509947599 :       int to_right = (SORT_BEFORE( p2, kj ));
     356   509947599 :       SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );
     357             : 
     358             : #     if SORT_QUICK_SWAP_MINIMIZE
     359             :       if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; } /* ~2/3 prob */
     360             : #     else
     361             :       /**/                      key[j] = key[m]; key[m] = kj;
     362   509947599 : #     endif
     363             : 
     364   509947599 :       i += (SORT_IDX_T) to_left;
     365   509947599 :       j += (SORT_IDX_T)(to_right^1); /* to_left_or_to_center <=> !to_right */
     366   509947599 :       k -= (SORT_IDX_T) to_right;
     367   509947599 :     } while( j<=k );
     368             : 
     369             :     /* Schedule recursion */
     370             : 
     371     3648915 :     if( FD_UNLIKELY( (j-i)==n ) ) {
     372             : 
     373             :       /* All the keys ended up in the center partition.  If p1<p2, we
     374             :          had an unlucky choice of pivots such that p1==min(key[i]) and
     375             :          p2==max(key[i]) for i in [l,h).  Since we picked the keys
     376             :          deterministically above, we would have the same bad luck the
     377             :          next time we processed this partition.  But, because there are
     378             :          at least two distinct elements in the partition, there is a
     379             :          choice of pivots that will make a non-trivial partition, so we
     380             :          need to redo this partition with different pivots.
     381             : 
     382             :          Randomly picking a new pivot pair in [l,h) has probability
     383             :          between ~50% and ~100%.  The worst case is half the keys in
     384             :          the partition equal to p1 and the other half equal to p2; the
     385             :          best case exactly one key is equal to p1 / p2 and all other
     386             :          keys are in (p1,p2] / [p1,p2).
     387             : 
     388             :          The optimal handling of the worst case though is just to set
     389             :          p1=p2 (p2=p1).  This will pull the keys equal to p1 (p2) into
     390             :          the left (right) partition and the remaining keys into the
     391             :          center partition; the right (left) partition will be empty.
     392             :          Thus, this is guaranteed to yield a non-trivial partition of
     393             :          the center partition but may not yield as load balanced set of
     394             :          partitions as random for the next iteration.  We go with the
     395             :          deterministic option for simplicity below. */
     396             : 
     397     1550499 :       if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }
     398             : 
     399             :       /* p1==p2 here ... all keys in this partition are the same
     400             :          We don't need to recurse further in this partition. */
     401             : 
     402     2098416 :     } else {
     403             : 
     404             :       /* Between [1,n-1] keys ended up in the center partition such that
     405             :          at least 1 key ended up in another partition.  We push all
     406             :          three partitions onto the partition stack for recursion.  The
     407             :          order of the pushes is important to bound the maximum stack
     408             :          usage.  Below, we push the center partition followed by the
     409             :          larger of the left and right partitions then followed by the
     410             :          smaller of the left and right partitions (such that we process
     411             :          the smallest next iteration).  For this, the same O(log_2 cnt)
     412             :          max recursion depth applies as quicksort even this does a three
     413             :          way partitioning.  That is, let fl/fc/fr be the fraction of
     414             :          keys in the left, center, right partitions.  At this point, fl
     415             :          is in [0,1), fc is in (0,1) and fr is in [0,1) and fl+fc+fr=1.
     416             :          The smaller of the left or right partition is what process next
     417             :          and, as the smaller of the left or right, it fraction of the
     418             :          keys is at most 0.5(1-fc)<=0.5.  We do push up to two
     419             :          partitions onto the stack (four SORT_IDX_T) each level of
     420             :          recursion though instead of one like normal quick sort though. */
     421             : 
     422     2098416 :       int left_larger = (i-l) > (h-j); /* True if the left partition should go on the stack */
     423     2098416 :       SORT_IDX_T l_larger  = SORT_IDX_IF( left_larger, l, j );
     424     2098416 :       SORT_IDX_T h_larger  = SORT_IDX_IF( left_larger, i, h );
     425     2098416 :       SORT_IDX_T l_smaller = SORT_IDX_IF( left_larger, j, l );
     426     2098416 :       SORT_IDX_T h_smaller = SORT_IDX_IF( left_larger, h, i );
     427             : 
     428             :       /* Immediately process empty partitions */
     429     2098416 :       ulong push_smaller = (ulong)(l_smaller<h_smaller); FD_COMPILER_FORGET( push_smaller );
     430             : 
     431             :       /* Immediately process partitions where all keys are the same */
     432     2098416 :       ulong push_center  = (ulong)(SORT_BEFORE( p1, p2 )); FD_COMPILER_FORGET( push_center );
     433             : 
     434             :       /* Guaranteed non-empty (larger of left or right have at least 1 key) */
     435     2098416 :       ulong push_larger  = 1UL;
     436             : 
     437     2098416 :       stack[ stack_cnt ] = l_larger;  stack_cnt += push_larger;
     438     2098416 :       stack[ stack_cnt ] = h_larger;  stack_cnt += push_larger;
     439     2098416 :       stack[ stack_cnt ] = i;         stack_cnt += push_center;
     440     2098416 :       stack[ stack_cnt ] = j;         stack_cnt += push_center;
     441     2098416 :       stack[ stack_cnt ] = l_smaller; stack_cnt += push_smaller;
     442     2098416 :       stack[ stack_cnt ] = h_smaller; stack_cnt += push_smaller;
     443     2098416 :     }
     444     3648915 :   }
     445             :   /* never get here */
     446     2084559 : }
     447             : 
     448             : SORT_IMPL_STATIC SORT_KEY_T *
     449             : SORT_(private_select)( SORT_KEY_T * key,
     450             :                        SORT_IDX_T   cnt,
     451      231492 :                        SORT_IDX_T   rnk ) {
     452      231492 :   SORT_IDX_T l = (SORT_IDX_T)0;
     453      231492 :   SORT_IDX_T h = cnt;
     454      748719 :   for(;;) {
     455             : 
     456             :     /* The partition [l,h) contains rnk.  If this partition is small
     457             :        enough, sort it via insertion sort. */
     458             : 
     459      748719 :     SORT_IDX_T n = h-l;
     460      748719 :     if( FD_LIKELY( n <= ((SORT_IDX_T)(SORT_QUICK_THRESH)) ) ) { SORT_(insert)( key+l, n ); return key; }
     461             : 
     462             :     /* This partition is too large to insertion sort.  Pick two pivots,
     463             :        sort the partition into a left, center and right partition and
     464             :        then recursive on the partition holding rnk.  We initially use a
     465             :        simple choice of pivots that is ideal for nearly sorted data (in
     466             :        either direction) with a little extra sampling to improve the
     467             :        partitioning for randomly ordered data.  There is a possibility
     468             :        that this deterministic choice of pivots will not succeed in
     469             :        making two or more non-empty partitions; we detect and correct
     470             :        that below. */
     471             : 
     472             :     /* Initial pivot selection */
     473             : 
     474      517251 :     SORT_KEY_T p1;
     475      517251 :     SORT_KEY_T p2;
     476      517251 :     do {
     477      517251 :       SORT_IDX_T n3 = n / (SORT_IDX_T)3; /* magic multiply under the hood typically */
     478      517251 :       SORT_IDX_T h1 = h - (SORT_IDX_T)1;
     479      517251 :       SORT_KEY_T p0 = key[ l       ];
     480      517251 :       /**/       p1 = key[ l  + n3 ];
     481      517251 :       /**/       p2 = key[ h1 - n3 ];
     482      517251 :       SORT_KEY_T p3 = key[ h1      ];
     483      517251 : #     if SORT_QUICK_ORDER_STYLE==0
     484             :       /* Generates better code for integral types (branchless via stack) */
     485      517251 :       SORT_KEY_T _k[2]; ulong _c;
     486     2586255 : #     define ORDER(k0,k1) _k[0] = k0; _k[1] = k1; _c = (ulong)(SORT_BEFORE(k1,k0)); k0 = _k[_c]; k1 = _k[_c^1UL];
     487             : #     else
     488             :       /* Generates much better code for floating point types (branchless
     489             :          via min / max floating point instructions) */
     490             :       SORT_KEY_T _k0; SORT_KEY_T _k1; int _c;
     491             : #     define ORDER(k0,k1) _c = (SORT_BEFORE(k1,k0)); _k0 = _c ? k1 : k0; _k1 = _c ? k0 : k1; k0 = _k0; k1 = _k1;
     492             : #     endif
     493             : 
     494      517251 :       ORDER(p0,p2); ORDER(p1,p3);
     495      517251 :       ORDER(p0,p1); ORDER(p2,p3);
     496      517251 :       ORDER(p1,p2);
     497      517251 : #     undef ORDER
     498      517251 :     } while(0);
     499             : 
     500      517263 :   retry: /* Three way partitioning (silly language restriction) */;
     501      517263 :     SORT_IDX_T i = l;
     502      517263 :     SORT_IDX_T j = l;
     503      517263 :     SORT_IDX_T k = h-(SORT_IDX_T)1;
     504    57784200 :     do { /* Note [j,k] is non-empty (look ma ... no branches!) */
     505             : 
     506             :       /* At this point, p1 <= p2 and:
     507             :          - keys [l,i) are before p1 (left partition)
     508             :          - keys [i,j) are in [p1,p2] (center partition)
     509             :          - keys [j,k] are not yet partitioned (region is non-empty)
     510             :          - keys (k,h) are after p2 (right partition)
     511             :          Decide where key j should be moved. */
     512             : 
     513    57784200 :       SORT_KEY_T kj = key[j];
     514             : 
     515    57784200 :       int to_left  = (SORT_BEFORE( kj, p1 ));
     516    57784200 :       int to_right = (SORT_BEFORE( p2, kj ));
     517    57784200 :       SORT_IDX_T m = SORT_IDX_IF( to_right, k, SORT_IDX_IF( to_left, i, j ) );
     518             : 
     519             : #     if SORT_QUICK_SWAP_MINIMIZE
     520             :       if( FD_LIKELY( j!=h ) ) { key[j] = key[m]; key[m] = kj; } /* ~2/3 prob */
     521             : #     else
     522             :       /**/                   key[j] = key[m]; key[m] = kj;
     523    57784200 : #     endif
     524             : 
     525    57784200 :       i += (SORT_IDX_T) to_left;
     526    57784200 :       j += (SORT_IDX_T)(to_right^1); /* to_left_or_to_center <=> !to_right */
     527    57784200 :       k -= (SORT_IDX_T) to_right;
     528    57784200 :     } while( j<=k );
     529             : 
     530      517263 :     if( FD_UNLIKELY( (j-i)==n ) ) {
     531             : 
     532             :       /* All the keys ended up in the center partition.  If p1<p2, we
     533             :          had an unlucky choice of pivots such that p1==min(key[i]) and
     534             :          p2==max(key[i]) for i in [l,h).  Since we picked the keys
     535             :          deterministically above, we would have the same bad luck the
     536             :          next time we processed this partition.  But, because there are
     537             :          at least two distinct elements in the partition, there is a
     538             :          choice of pivots that will make a non-trivial partition, so we
     539             :          need to redo this partition with different pivots.
     540             : 
     541             :          Randomly picking a new pivot pair in [l,h) has probability
     542             :          between ~50% and ~100%.  The worst case is half the keys in
     543             :          the partition equal to p1 and the other half equal to p2; the
     544             :          best case exactly one key is equal to p1 / p2 and all other
     545             :          keys are in (p1,p2] / [p1,p2).
     546             : 
     547             :          The optimal handling of the worst case though is just to set
     548             :          p1=p2 (p2=p1).  This will pull the keys equal to p1 (p2) into
     549             :          the left (right) partition and the remaining keys into the
     550             :          center partition; the right (left) partition will be empty.
     551             :          Thus, this is guaranteed to yield a non-trivial partition of
     552             :          the center partition but may not yield as load balanced set of
     553             :          partitions as random for the next iteration.  We go with the
     554             :          deterministic option for simplicity below. */
     555             : 
     556          36 :       if( FD_UNLIKELY( (SORT_BEFORE( p1, p2 )) ) ) { p1 = p2; goto retry; }
     557             : 
     558             :       /* p1==p2 here ... all keys in this partition are the same.  We
     559             :          don't need to recurse further in this partition as all keys
     560             :          would do here. */
     561             : 
     562          24 :       return key;
     563          36 :     }
     564             : 
     565             :     /* At this point:
     566             :        - [l,i) is the left partition,
     567             :        - [i,j) is the center partition
     568             :        - [j,h) is the right partition
     569             :        Between [1,n-1] keys ended up in the center partition such that
     570             :        at least 1 key ended up in another partition.  Recurse on the
     571             :        partition that contains rnk.  As this is typically used in median
     572             :        finding, we assume that the center partition is the most likely
     573             :        in the below selection (this can also be done branchlessly). */
     574             : 
     575      517227 :     SORT_IDX_T l_next = i; SORT_IDX_T h_next = j;
     576      517227 :     if( FD_UNLIKELY( rnk< i   ) ) l_next = l, h_next = i;
     577      517227 :     if( FD_UNLIKELY( j  <=rnk ) ) l_next = j, h_next = h;
     578      517227 :     l = l_next; h = h_next;
     579      517227 :   }
     580             :   /* never get here */
     581      231492 : }
     582             : 
     583             : #undef SORT_IMPL_STATIC
     584             : 
     585             : #endif
     586             : 
     587             : #if SORT_IMPL_STYLE==0 || SORT_IMPL_STYLE==1 /* need interface */
     588             : 
     589           0 : FD_FN_CONST static inline int SORT_(stable_cnt_valid)( SORT_IDX_T cnt ) {
     590           0 :   /* Written funny for supporting different signed idx types without
     591           0 :      generating compiler warnings */
     592           0 :   SORT_IDX_T max = (SORT_IDX_T)fd_ulong_min( (-alignof(SORT_KEY_T))/sizeof(SORT_KEY_T), /* ==floor( (2^64-align-1)/sz ) */
     593           0 :                                              (ulong)(SORT_IDX_T)((~0UL)>>1) );          /* ==SORT_IDX_MAX as ulong */
     594           0 :   return (!cnt) | ((((SORT_IDX_T)0)<cnt) & (cnt<max)) | (cnt==max);
     595           0 : }
     596             : 
     597           0 : FD_FN_CONST static inline ulong SORT_(stable_scratch_align)    ( void )           { return alignof(SORT_KEY_T); }
     598           0 : FD_FN_CONST static inline ulong SORT_(stable_scratch_footprint)( SORT_IDX_T cnt ) { return sizeof (SORT_KEY_T)*(ulong)cnt; }
     599             : 
     600             : static inline SORT_KEY_T *
     601             : SORT_(stable_fast)( SORT_KEY_T * key,
     602             :                     SORT_IDX_T   cnt,
     603      510576 :                     void *       scratch ) {
     604      510576 :   SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
     605      510576 :   return SORT_(private_merge)( key, cnt, tmp );
     606      510576 : }
     607             : 
     608             : FD_FN_UNUSED static SORT_KEY_T * /* Work around -Winline */
     609             : SORT_(stable)( SORT_KEY_T * key,
     610             :                SORT_IDX_T   cnt,
     611      510576 :                void *       scratch ) {
     612      510576 :   SORT_KEY_T * tmp = (SORT_KEY_T *)scratch;
     613     7771710 :   if( SORT_(private_merge)( key, cnt, tmp )==tmp ) for( SORT_IDX_T i=((SORT_IDX_T)0); i<cnt; i++ ) key[i] = tmp[i];
     614      510576 :   return key;
     615      510576 : }
     616             : 
     617             : static inline SORT_KEY_T *
     618             : SORT_(inplace)( SORT_KEY_T * key,
     619     2084559 :                 SORT_IDX_T   cnt ) {
     620     2084559 :   return SORT_(private_quick)( key, cnt );
     621     2084559 : }
     622             : 
     623             : static inline SORT_KEY_T *
     624             : SORT_(select)( SORT_KEY_T * key,
     625             :                SORT_IDX_T   cnt,
     626      231492 :                SORT_IDX_T   rnk ) {
     627      231492 :   return SORT_(private_select)( key, cnt, rnk );
     628      231492 : }
     629             : 
     630             : static inline SORT_IDX_T
     631             : SORT_(search_geq)( SORT_KEY_T const * sorted,
     632             :                    SORT_IDX_T         cnt,
     633       17196 :                    SORT_KEY_T         query ) {
     634       17196 :   SORT_IDX_T j = (SORT_IDX_T)0;
     635       17196 :   SORT_IDX_T n = (SORT_IDX_T)cnt;
     636       91860 :   while( n>((SORT_IDX_T)1) ) {
     637             :     /* At this point the j corresponding to k is in [j,j+n) */
     638       74664 :     SORT_IDX_T j_left  = j;          SORT_IDX_T n_left  = n >> 1;
     639       74664 :     SORT_IDX_T j_right = j + n_left; SORT_IDX_T n_right = n - n_left;
     640       74664 :     int  go_left = SORT_BEFORE( query, sorted[ j_right ] );
     641       74664 :     j = go_left ? j_left : j_right; /* branchless */
     642       74664 :     n = go_left ? n_left : n_right; /* branchless */
     643       74664 :   }
     644       17196 :   return j;
     645       17196 : }
     646             : 
     647             : #endif
     648             : 
     649             : #undef SORT_
     650             : 
     651             : #undef SORT_IDX_IF
     652             : #undef SORT_IMPL_STYLE
     653             : #undef SORT_QUICK_SWAP_MINIMIZE
     654             : #undef SORT_QUICK_ORDER_STYLE
     655             : #undef SORT_QUICK_THRESH
     656             : #undef SORT_MERGE_THRESH
     657             : #undef SORT_BEFORE
     658             : #undef SORT_IDX_T
     659             : #undef SORT_KEY_T
     660             : #undef SORT_NAME
     661             : 

Generated by: LCOV version 1.14