core/slice/sort/stable/drift.rs
//! This module contains the hybrid top-level loop combining bottom-up Mergesort with top-down
//! Quicksort.

use crate::mem::MaybeUninit;
use crate::slice::sort::shared::find_existing_run;
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
use crate::slice::sort::stable::merge::merge;
use crate::slice::sort::stable::quicksort::quicksort;
use crate::{cmp, intrinsics};

/// Sorts `v` based on comparison function `is_less`. If `eager_sort` is true,
/// it will only do small-sorts and physical merges, ensuring O(N * log(N))
/// worst-case complexity. `scratch.len()` must be at least
/// `max(v.len() - v.len() / 2, SMALL_SORT_GENERAL_SCRATCH_LEN)` otherwise the
/// implementation may abort. Fully ascending and descending inputs will be
/// sorted with exactly N - 1 comparisons.
///
/// This is the main loop for driftsort, which uses powersort's heuristic to
/// determine in which order to merge runs, see below for details.
pub fn sort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    eager_sort: bool,
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return; // Removing this length check *increases* code size.
    }
    let scale_factor = merge_tree_scale_factor(len);

    // It's important to have a relatively high entry barrier for pre-sorted
    // runs, as the presence of a single such run will force on average several
    // merge operations and shrink the maximum quicksort size a lot. For that
    // reason we use sqrt(len) as our pre-sorted run threshold.
    const MIN_SQRT_RUN_LEN: usize = 64;
    let min_good_run_len = if len <= (MIN_SQRT_RUN_LEN * MIN_SQRT_RUN_LEN) {
        // For small input lengths, `MIN_SQRT_RUN_LEN` would break pattern
        // detection of fully or nearly sorted inputs.
        cmp::min(len - len / 2, MIN_SQRT_RUN_LEN)
    } else {
        sqrt_approx(len)
    };
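
    // For example (illustrative numbers, not from the original comments):
    // len = 4096 == 64 * 64 stays on the first branch and gives
    // min(4096 - 2048, 64) = 64, while len = 1_000_000 takes the sqrt_approx
    // path and yields a threshold of 1000.
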
    // (stack_len, runs, desired_depths) together form a stack maintaining run
    // information for the powersort heuristic. desired_depths[i] is the desired
    // depth of the merge node that merges runs[i] with the run that comes after
    // it.
    let mut stack_len = 0;
    let mut run_storage = MaybeUninit::<[DriftsortRun; 66]>::uninit();
    let runs: *mut DriftsortRun = run_storage.as_mut_ptr().cast();
    let mut desired_depth_storage = MaybeUninit::<[u8; 66]>::uninit();
    let desired_depths: *mut u8 = desired_depth_storage.as_mut_ptr().cast();

    let mut scan_idx = 0;
    let mut prev_run = DriftsortRun::new_sorted(0); // Initial dummy run.
    loop {
        // Compute the next run and the desired depth of the merge node between
        // prev_run and next_run. On the last iteration we create a dummy run
        // with root-level desired depth to fully collapse the merge tree.
        let (next_run, desired_depth);
        if scan_idx < len {
            next_run =
                create_run(&mut v[scan_idx..], scratch, min_good_run_len, eager_sort, is_less);
            desired_depth = merge_tree_depth(
                scan_idx - prev_run.len(),
                scan_idx,
                scan_idx + next_run.len(),
                scale_factor,
            );
        } else {
            next_run = DriftsortRun::new_sorted(0);
            desired_depth = 0;
        };

        // Process the merge nodes between earlier runs[i] that have a desire to
        // be deeper in the merge tree than the merge node for the splitpoint
        // between prev_run and next_run.
        //
        // SAFETY: first note that this is the only place we modify stack_len,
        // runs or desired_depths. We maintain the following invariants:
        // 1. The first stack_len elements of runs/desired_depths are initialized.
        // 2. For all valid i > 0, desired_depths[i] < desired_depths[i+1].
        // 3. The sum of all valid runs[i].len() plus prev_run.len() equals
        //    scan_idx.
        unsafe {
            while stack_len > 1 && *desired_depths.add(stack_len - 1) >= desired_depth {
                // Desired depth greater than the upcoming desired depth, pop
                // left neighbor run from stack and merge into prev_run.
                let left = *runs.add(stack_len - 1);
                let merged_len = left.len() + prev_run.len();
                let merge_start_idx = scan_idx - merged_len;
                let merge_slice = v.get_unchecked_mut(merge_start_idx..scan_idx);
                prev_run = logical_merge(merge_slice, scratch, left, prev_run, is_less);
                stack_len -= 1;
            }

            // We now know that desired_depths[stack_len - 1] < desired_depth,
            // maintaining our invariant. This also guarantees we don't overflow
            // the stack as merge_tree_depth(..) <= 64 and thus we can only have
            // 64 distinct values on the stack before pushing, plus our initial
            // dummy run, while our capacity is 66.
            *runs.add(stack_len) = prev_run;
            *desired_depths.add(stack_len) = desired_depth;
            stack_len += 1;
        }

        // Break before overriding the last run with our dummy run.
        if scan_idx >= len {
            break;
        }

        scan_idx += next_run.len();
        prev_run = next_run;
    }

    if !prev_run.sorted() {
        stable_quicksort(v, scratch, is_less);
    }
}
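
// Illustrative driver (a sketch, not part of this module; `Vec` is not even
// available in `core`, and the real caller lives in the stable sort entry
// point). The scratch capacity of 64 is an assumption made here for the
// example, chosen to exceed `SMALL_SORT_GENERAL_SCRATCH_LEN`:
//
//     let mut v = vec![5_i32, 1, 4, 2, 3];
//     let mut scratch: Vec<MaybeUninit<i32>> = Vec::with_capacity(64);
//     sort(&mut v, scratch.spare_capacity_mut(), false, &mut |a, b| a < b);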

// Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods That Optimally
// Adapt to Existing Runs by J. Ian Munro and Sebastian Wild.
//
// This method forms a binary merge tree, where each internal node corresponds
// to a splitting point between the adjacent runs that have to be merged. If we
// visualize our array as the number line from 0 to 1, we want to find the
// dyadic fraction with smallest denominator that lies between the midpoints of
// our to-be-merged slices. The exponent in the dyadic fraction indicates the
// desired depth in the binary merge tree this internal node wishes to have.
// This does not always correspond to the actual depth due to the inherent
// imbalance in runs, but we follow it as closely as possible.
//
// As an optimization we rescale the number line from [0, 1) to [0, 2^62). Then
// finding the simplest dyadic fraction between midpoints corresponds to finding
// the most significant bit difference of the midpoints. We save scale_factor =
// ceil(2^62 / n) to perform this rescaling using a multiplication, avoiding
// having to repeatedly do integer divides. This rescaling isn't exact when n is
// not a power of two since we use integers and not reals, but the result is
// very close, and in fact when n < 2^30 the resulting tree is equivalent as the
// approximation errors stay entirely in the lower order bits.
//
// Thus for the splitting point between two adjacent slices [a, b) and [b, c)
// the desired depth of the corresponding merge node is CLZ((a+b)*f ^ (b+c)*f),
// where CLZ counts the number of leading zeros in an integer and f is our scale
// factor. Note that we omitted the division by two in the midpoint
// calculations, as this simply shifts the bits by one position (and thus always
// adds one to the result), and we only care about the relative depths.
//
// Finally, if we try to upper bound x = (a+b)*f giving x = (n-1 + n) * ceil(2^62 / n) then
//    x < (2^62 / n + 1) * 2n
//    x < 2^63 + 2n
// So as long as n < 2^62 we find that x < 2^64, meaning our operations do not
// overflow.
#[inline(always)]
fn merge_tree_scale_factor(n: usize) -> u64 {
    if usize::BITS > u64::BITS {
        panic!("Platform not supported");
    }

    ((1 << 62) + n as u64 - 1) / n as u64
}

// Note: merge_tree_depth output is < 64 when left < right as f*x and f*y must
// differ in some bit, and is <= 64 always.
#[inline(always)]
fn merge_tree_depth(left: usize, mid: usize, right: usize, scale_factor: u64) -> u8 {
    let x = left as u64 + mid as u64;
    let y = mid as u64 + right as u64;
    ((scale_factor * x) ^ (scale_factor * y)).leading_zeros() as u8
}
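
// Worked example (illustrative, with numbers not taken from the comments
// above): for n = 8 the scale factor is ceil(2^62 / 8) = 2^59. The merge node
// between runs [0, 2) and [2, 4) gets depth
// CLZ(2 * 2^59 ^ 6 * 2^59) = CLZ(2^60 ^ (2^61 + 2^60)) = CLZ(2^61) = 2, while
// the root-level node between [0, 4) and [4, 8) gets
// CLZ(4 * 2^59 ^ 12 * 2^59) = CLZ(2^62) = 1, i.e. it sits one level closer to
// the root of the merge tree, as intended.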

fn sqrt_approx(n: usize) -> usize {
    // Note that sqrt(n) = n^(1/2), and that 2^log2(n) = n. We combine these
    // two facts to approximate sqrt(n) as 2^(log2(n) / 2). Because our integer
    // log floors we want to add 0.5 to compensate for this on average, so our
    // initial approximation is 2^((1 + floor(log2(n))) / 2).
    //
    // We then apply an iteration of Newton's method to improve our
    // approximation, which for sqrt(n) is a1 = (a0 + n / a0) / 2.
    //
    // Finally we note that the exponentiation / division can be done directly
    // with shifts. We OR with 1 to avoid zero-checks in the integer log.
    let ilog = (n | 1).ilog2();
    let shift = (1 + ilog) / 2;
    ((1 << shift) + (n >> shift)) / 2
}
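
// Worked example (illustrative): sqrt_approx(1_000_000) computes
// ilog2(1_000_001) = 19 and shift = 10, returning
// ((1 << 10) + (1_000_000 >> 10)) / 2 = (1024 + 976) / 2 = 1000, which here
// happens to be the exact square root.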

// Lazy logical runs as in Glidesort.
#[inline(always)]
fn logical_merge<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    left: DriftsortRun,
    right: DriftsortRun,
    is_less: &mut F,
) -> DriftsortRun {
    // If one or both of the runs are sorted do a physical merge, using
    // quicksort to sort the unsorted run if present. We also *need* to
    // physically merge if the combined runs would not fit in the scratch space
    // anymore (as this would mean we are no longer able to quicksort them).
    let len = v.len();
    let can_fit_in_scratch = len <= scratch.len();
    if !can_fit_in_scratch || left.sorted() || right.sorted() {
        if !left.sorted() {
            stable_quicksort(&mut v[..left.len()], scratch, is_less);
        }
        if !right.sorted() {
            stable_quicksort(&mut v[left.len()..], scratch, is_less);
        }
        merge(v, scratch, left.len(), is_less);

        DriftsortRun::new_sorted(len)
    } else {
        DriftsortRun::new_unsorted(len)
    }
}
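
// For example (behavior read off the code above): logically merging
// unsorted(100) with unsorted(200) while 300 elements still fit in scratch
// yields unsorted(300); no elements move, and the actual sorting is deferred
// to a later physical merge or the final quicksort in `sort`.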

/// Creates a new logical run.
///
/// A logical run can either be sorted or unsorted. If there is a pre-existing
/// run that clears the `min_good_run_len` threshold it is returned as a sorted
/// run. If not, the result depends on the value of `eager_sort`. If it is true,
/// then a sorted run of length `T::SMALL_SORT_THRESHOLD` is returned, and if it
/// is false an unsorted run of length `min_good_run_len` is returned.
fn create_run<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    min_good_run_len: usize,
    eager_sort: bool,
    is_less: &mut F,
) -> DriftsortRun {
    let len = v.len();
    if len >= min_good_run_len {
        let (run_len, was_reversed) = find_existing_run(v, is_less);

        // SAFETY: find_existing_run promises to return a valid run_len.
        unsafe { intrinsics::assume(run_len <= len) };

        if run_len >= min_good_run_len {
            if was_reversed {
                v[..run_len].reverse();
            }

            return DriftsortRun::new_sorted(run_len);
        }
    }

    if eager_sort {
        // We call quicksort with a len that will immediately call small-sort.
        // By not calling the small-sort directly here it can always be inlined
        // into the quicksort itself, which makes the recursive base case faster
        // and is generally more binary-size efficient.
        let eager_run_len = cmp::min(T::small_sort_threshold(), len);
        quicksort(&mut v[..eager_run_len], scratch, 0, None, is_less);
        DriftsortRun::new_sorted(eager_run_len)
    } else {
        DriftsortRun::new_unsorted(cmp::min(min_good_run_len, len))
    }
}
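
// For example (illustrative, with eager_sort == false): given
// min_good_run_len = 64, a fully descending prefix of 500 elements is found by
// find_existing_run, reversed in place, and returned as sorted(500), while a
// random prefix produces unsorted(64) and defers the work.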

fn stable_quicksort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
    // The binary OR by one is used to eliminate the zero-check in the logarithm.
    let limit = 2 * (v.len() | 1).ilog2();
    quicksort(v, scratch, limit, None, is_less);
}

/// Compactly stores the length of a run, and whether or not it is sorted. This
/// can always fit in a `usize` because the maximum slice length is [`isize::MAX`].
#[derive(Copy, Clone)]
struct DriftsortRun(usize);

impl DriftsortRun {
    #[inline(always)]
    fn new_sorted(length: usize) -> Self {
        Self((length << 1) | 1)
    }

    #[inline(always)]
    fn new_unsorted(length: usize) -> Self {
        Self(length << 1)
    }

    #[inline(always)]
    fn sorted(self) -> bool {
        self.0 & 1 == 1
    }

    #[inline(always)]
    fn len(self) -> usize {
        self.0 >> 1
    }
}
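
// For example (illustrative): DriftsortRun::new_sorted(5) stores
// (5 << 1) | 1 == 0b1011, so len() recovers 0b1011 >> 1 == 5 and sorted()
// reads the low bit as true; new_unsorted(5) stores 0b1010 instead.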