core/slice/sort/stable/drift.rs

//! This module contains the hybrid top-level loop combining bottom-up Mergesort with top-down
//! Quicksort.

use crate::mem::MaybeUninit;
use crate::slice::sort::shared::find_existing_run;
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
use crate::slice::sort::stable::merge::merge;
use crate::slice::sort::stable::quicksort::quicksort;
use crate::{cmp, intrinsics};

/// Sorts `v` based on comparison function `is_less`. If `eager_sort` is true,
/// it will only do small-sorts and physical merges, ensuring O(N * log(N))
/// worst-case complexity. `scratch.len()` must be at least
/// `max(v.len() - v.len() / 2, SMALL_SORT_GENERAL_SCRATCH_LEN)`, otherwise the
/// implementation may abort. Fully ascending and descending inputs will be
/// sorted with exactly N - 1 comparisons.
///
/// This is the main loop for driftsort, which uses powersort's heuristic to
/// determine in which order to merge runs, see below for details.
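///
/// A hypothetical caller sketch (illustrative only, not a runnable doctest;
/// this function is crate-internal and the import path is an assumption):
///
/// ```ignore
/// use crate::slice::sort::shared::smallsort::SMALL_SORT_GENERAL_SCRATCH_LEN;
///
/// let mut v = [5, 3, 1, 4, 2];
/// // The scratch buffer must hold at least
/// // max(v.len() - v.len() / 2, SMALL_SORT_GENERAL_SCRATCH_LEN) elements.
/// // MaybeUninit<i32> is Copy, so an array repeat expression works here.
/// let mut scratch = [MaybeUninit::<i32>::uninit(); SMALL_SORT_GENERAL_SCRATCH_LEN];
/// sort(&mut v, &mut scratch, false, &mut |a, b| a < b);
/// assert_eq!(v, [1, 2, 3, 4, 5]);
/// ```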
pub fn sort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    eager_sort: bool,
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return; // Removing this length check *increases* code size.
    }
    let scale_factor = merge_tree_scale_factor(len);

    // It's important to have a relatively high entry barrier for pre-sorted
    // runs, as the presence of a single such run will force on average several
    // merge operations and shrink the maximum quicksort size a lot. For that
    // reason we use sqrt(len) as our pre-sorted run threshold.
    const MIN_SQRT_RUN_LEN: usize = 64;
    let min_good_run_len = if len <= (MIN_SQRT_RUN_LEN * MIN_SQRT_RUN_LEN) {
        // For small input lengths, `MIN_SQRT_RUN_LEN` would break pattern
        // detection of fully or nearly sorted inputs.
        cmp::min(len - len / 2, MIN_SQRT_RUN_LEN)
    } else {
        sqrt_approx(len)
    };
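
    // Illustrative numbers (an added example, not part of the original
    // comments): for len = 1_000_000 the threshold is sqrt_approx(1_000_000)
    // == 1000, so only pre-existing sorted runs of at least 1000 elements are
    // kept; for len = 100 it is min(100 - 100 / 2, 64) == 50.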

    // (stack_len, runs, desired_depths) together form a stack maintaining run
    // information for the powersort heuristic. desired_depths[i] is the desired
    // depth of the merge node that merges runs[i] with the run that comes after
    // it.
    let mut stack_len = 0;
    let mut run_storage = MaybeUninit::<[DriftsortRun; 66]>::uninit();
    let runs: *mut DriftsortRun = run_storage.as_mut_ptr().cast();
    let mut desired_depth_storage = MaybeUninit::<[u8; 66]>::uninit();
    let desired_depths: *mut u8 = desired_depth_storage.as_mut_ptr().cast();

    let mut scan_idx = 0;
    let mut prev_run = DriftsortRun::new_sorted(0); // Initial dummy run.
    loop {
        // Compute the next run and the desired depth of the merge node between
        // prev_run and next_run. On the last iteration we create a dummy run
        // with root-level desired depth to fully collapse the merge tree.
        let (next_run, desired_depth);
        if scan_idx < len {
            next_run =
                create_run(&mut v[scan_idx..], scratch, min_good_run_len, eager_sort, is_less);
            desired_depth = merge_tree_depth(
                scan_idx - prev_run.len(),
                scan_idx,
                scan_idx + next_run.len(),
                scale_factor,
            );
        } else {
            next_run = DriftsortRun::new_sorted(0);
            desired_depth = 0;
        };

        // Process the merge nodes between earlier runs[i] that desire to sit
        // deeper in the merge tree than the merge node for the splitpoint
        // between prev_run and next_run.
        //
        // SAFETY: first note that this is the only place we modify stack_len,
        // runs or desired_depths. We maintain the following invariants:
        //  1. The first stack_len elements of runs/desired_depths are initialized.
        //  2. For all valid i > 0, desired_depths[i] < desired_depths[i+1].
        //  3. The sum of all valid runs[i].len() plus prev_run.len() equals
        //     scan_idx.
        unsafe {
            while stack_len > 1 && *desired_depths.add(stack_len - 1) >= desired_depth {
                // The top desired depth is at least the upcoming desired depth;
                // pop the left neighbor run from the stack and merge it into
                // prev_run.
                let left = *runs.add(stack_len - 1);
                let merged_len = left.len() + prev_run.len();
                let merge_start_idx = scan_idx - merged_len;
                let merge_slice = v.get_unchecked_mut(merge_start_idx..scan_idx);
                prev_run = logical_merge(merge_slice, scratch, left, prev_run, is_less);
                stack_len -= 1;
            }

            // We now know that desired_depths[stack_len - 1] < desired_depth,
            // maintaining our invariant. This also guarantees we don't overflow
            // the stack as merge_tree_depth(..) <= 64 and thus we can only have
            // 64 distinct values on the stack before pushing, plus our initial
            // dummy run, while our capacity is 66.
            *runs.add(stack_len) = prev_run;
            *desired_depths.add(stack_len) = desired_depth;
            stack_len += 1;
        }

        // Break before overriding the last run with our dummy run.
        if scan_idx >= len {
            break;
        }

        scan_idx += next_run.len();
        prev_run = next_run;
    }

    if !prev_run.sorted() {
        stable_quicksort(v, scratch, is_less);
    }
}
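
// An illustrative trace (added example, not part of the original source): for
// eight elements arriving as four runs of two, the three splitpoint depths
// come out as 2, 1, 2. Runs 1 and 2 therefore merge as soon as the shallower
// middle splitpoint (depth 1) is encountered, runs 3 and 4 merge when the
// final dummy run (depth 0) collapses the stack, and the two halves then
// merge at the root, yielding a balanced merge tree.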

// Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods That Optimally
// Adapt to Existing Runs by J. Ian Munro and Sebastian Wild.
//
// This method forms a binary merge tree, where each internal node corresponds
// to a splitting point between the adjacent runs that have to be merged. If we
// visualize our array as the number line from 0 to 1, we want to find the
// dyadic fraction with smallest denominator that lies between the midpoints of
// our to-be-merged slices. The exponent in the dyadic fraction indicates the
// desired depth in the binary merge tree this internal node wishes to have.
// This does not always correspond to the actual depth due to the inherent
// imbalance in runs, but we follow it as closely as possible.
//
// As an optimization we rescale the number line from [0, 1) to [0, 2^62). Then
// finding the simplest dyadic fraction between midpoints corresponds to finding
// the most significant bit difference of the midpoints. We save scale_factor =
// ceil(2^62 / n) to perform this rescaling using a multiplication, avoiding
// having to repeatedly do integer divides. This rescaling isn't exact when n is
// not a power of two since we use integers and not reals, but the result is
// very close, and in fact when n < 2^30 the resulting tree is equivalent as the
// approximation errors stay entirely in the lower order bits.
//
// Thus for the splitting point between two adjacent slices [a, b) and [b, c)
// the desired depth of the corresponding merge node is CLZ((a+b)*f ^ (b+c)*f),
// where CLZ counts the number of leading zeros in an integer and f is our scale
// factor. Note that we omitted the division by two in the midpoint
// calculations, as this simply shifts the bits by one position (and thus always
// adds one to the result), and we only care about the relative depths.
//
// Finally, if we try to upper bound x = (a+b)*f giving x = (n-1 + n) * ceil(2^62 / n) then
//    x < (2^62 / n + 1) * 2n
//    x < 2^63 + 2n
// So as long as n < 2^62 we find that x < 2^64, meaning our operations do not
// overflow.
#[inline(always)]
fn merge_tree_scale_factor(n: usize) -> u64 {
    if usize::BITS > u64::BITS {
        panic!("Platform not supported");
    }

    ((1 << 62) + n as u64 - 1) / n as u64
}

// Note: merge_tree_depth output is < 64 when left < right as f*x and f*y must
// differ in some bit, and is <= 64 always.
#[inline(always)]
fn merge_tree_depth(left: usize, mid: usize, right: usize, scale_factor: u64) -> u8 {
    let x = left as u64 + mid as u64;
    let y = mid as u64 + right as u64;
    ((scale_factor * x) ^ (scale_factor * y)).leading_zeros() as u8
}
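
// A worked example for merge_tree_depth (added illustration, not part of the
// original source): with n = 4 we get f = ceil(2^62 / 4) = 2^60. For the
// splitpoint between [1, 2) and [2, 4), x = 1 + 2 = 3 and y = 2 + 4 = 6, so
// f*x has bits 61 and 60 set while f*y has bits 62 and 61 set; their XOR's
// highest set bit is bit 62, giving depth 1. For the splitpoint between
// [0, 1) and [1, 2), x = 1 and y = 3 give an XOR whose highest set bit is
// bit 61, giving depth 2. The deeper node (depth 2) is merged first.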

fn sqrt_approx(n: usize) -> usize {
    // Note that sqrt(n) = n^(1/2), and that 2^log2(n) = n. We combine these
    // two facts to approximate sqrt(n) as 2^(log2(n) / 2). Because our integer
    // log floors we want to add 0.5 to compensate for this on average, so our
    // initial approximation is 2^((1 + floor(log2(n))) / 2).
    //
    // We then apply an iteration of Newton's method to improve our
    // approximation, which for sqrt(n) is a1 = (a0 + n / a0) / 2.
    //
    // Finally we note that the exponentiation / division can be done directly
    // with shifts. We OR with 1 to avoid zero-checks in the integer log.
    let ilog = (n | 1).ilog2();
    let shift = (1 + ilog) / 2;
    ((1 << shift) + (n >> shift)) / 2
}
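
// For example (added illustration): sqrt_approx(1_000_000) computes ilog = 19
// and shift = 10, returning (1024 + 976) / 2 = 1000, which matches
// sqrt(1_000_000) exactly; sqrt_approx(5000) returns (64 + 78) / 2 = 71,
// close to the true value of about 70.7.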

// Lazy logical runs as in Glidesort.
#[inline(always)]
fn logical_merge<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    left: DriftsortRun,
    right: DriftsortRun,
    is_less: &mut F,
) -> DriftsortRun {
    // If one or both of the runs are sorted do a physical merge, using
    // quicksort to sort the unsorted run if present. We also *need* to
    // physically merge if the combined runs would not fit in the scratch space
    // anymore (as this would mean we are no longer able to quicksort them).
    let len = v.len();
    let can_fit_in_scratch = len <= scratch.len();
    if !can_fit_in_scratch || left.sorted() || right.sorted() {
        if !left.sorted() {
            stable_quicksort(&mut v[..left.len()], scratch, is_less);
        }
        if !right.sorted() {
            stable_quicksort(&mut v[left.len()..], scratch, is_less);
        }
        merge(v, scratch, left.len(), is_less);

        DriftsortRun::new_sorted(len)
    } else {
        DriftsortRun::new_unsorted(len)
    }
}
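
// Illustrative outcomes of logical_merge (added example, with hypothetical
// run states and scratch.len() == 8):
//
//   left          right         len   result
//   sorted(4)     sorted(4)     8     physical merge -> new_sorted(8)
//   unsorted(4)   sorted(4)     8     quicksort left half, merge -> new_sorted(8)
//   unsorted(4)   unsorted(4)   8     merge deferred -> new_unsorted(8)
//   unsorted(8)   unsorted(8)   16    exceeds scratch: quicksort both halves,
//                                     then merge -> new_sorted(16)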

/// Creates a new logical run.
///
/// A logical run can either be sorted or unsorted. If there is a pre-existing
/// run that clears the `min_good_run_len` threshold it is returned as a sorted
/// run. If not, the result depends on the value of `eager_sort`. If it is true,
/// then a sorted run of length `T::small_sort_threshold()` is returned, and if
/// it is false an unsorted run of length `min_good_run_len` is returned (both
/// capped at the slice length).
fn create_run<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    min_good_run_len: usize,
    eager_sort: bool,
    is_less: &mut F,
) -> DriftsortRun {
    let len = v.len();
    if len >= min_good_run_len {
        let (run_len, was_reversed) = find_existing_run(v, is_less);

        // SAFETY: find_existing_run promises to return a valid run_len.
        unsafe { intrinsics::assume(run_len <= len) };

        if run_len >= min_good_run_len {
            if was_reversed {
                v[..run_len].reverse();
            }

            return DriftsortRun::new_sorted(run_len);
        }
    }

    if eager_sort {
        // We call quicksort with a len that will immediately call small-sort.
        // By not calling the small-sort directly here, it can always be inlined
        // into the quicksort itself, which makes the recursive base case faster
        // and is generally more binary-size efficient.
        let eager_run_len = cmp::min(T::small_sort_threshold(), len);
        quicksort(&mut v[..eager_run_len], scratch, 0, None, is_less);
        DriftsortRun::new_sorted(eager_run_len)
    } else {
        DriftsortRun::new_unsorted(cmp::min(min_good_run_len, len))
    }
}

fn stable_quicksort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
    // The binary OR by one is used to eliminate the zero-check in the logarithm.
    let limit = 2 * (v.len() | 1).ilog2();
    quicksort(v, scratch, limit, None, is_less);
}

/// Compactly stores the length of a run, and whether or not it is sorted. This
/// can always fit in a `usize` because the maximum slice length is [`isize::MAX`].
#[derive(Copy, Clone)]
struct DriftsortRun(usize);

impl DriftsortRun {
    #[inline(always)]
    fn new_sorted(length: usize) -> Self {
        Self((length << 1) | 1)
    }

    #[inline(always)]
    fn new_unsorted(length: usize) -> Self {
        Self(length << 1)
    }

    #[inline(always)]
    fn sorted(self) -> bool {
        self.0 & 1 == 1
    }

    #[inline(always)]
    fn len(self) -> usize {
        self.0 >> 1
    }
}
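
// An illustrative round-trip of the bit packing (added example):
//
//     let run = DriftsortRun::new_sorted(5); // stored as (5 << 1) | 1 == 0b1011
//     assert!(run.sorted());                 // low bit is the sorted flag
//     assert_eq!(run.len(), 5);              // 0b1011 >> 1 == 5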