core/slice/sort/stable/drift.rs

//! This module contains the hybrid top-level loop combining bottom-up Mergesort with top-down
//! Quicksort.

use crate::mem::MaybeUninit;
use crate::slice::sort::shared::find_existing_run;
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
use crate::slice::sort::stable::merge::merge;
use crate::slice::sort::stable::quicksort::quicksort;
use crate::{cmp, intrinsics};

/// Sorts `v` based on comparison function `is_less`. If `eager_sort` is true,
/// it will only do small-sorts and physical merges, ensuring O(N * log(N))
/// worst-case complexity. `scratch.len()` must be at least
/// `max(v.len() - v.len() / 2, SMALL_SORT_GENERAL_SCRATCH_LEN)` otherwise the
/// implementation may abort.
/// Fully ascending and descending inputs will be sorted with exactly N - 1
/// comparisons.
///
/// This is the main loop for driftsort, which uses powersort's heuristic to
/// determine in which order to merge runs, see below for details.
pub fn sort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    eager_sort: bool,
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return; // Removing this length check *increases* code size.
    }
    let scale_factor = merge_tree_scale_factor(len);

    // It's important to have a relatively high entry barrier for pre-sorted
    // runs, as the presence of a single such run will force on average several
    // merge operations and shrink the maximum quicksort size a lot. For that
    // reason we use sqrt(len) as our pre-sorted run threshold.
    const MIN_SQRT_RUN_LEN: usize = 64;
    let min_good_run_len = if len <= (MIN_SQRT_RUN_LEN * MIN_SQRT_RUN_LEN) {
        // For small input lengths `MIN_SQRT_RUN_LEN` would break pattern
        // detection of fully or nearly sorted inputs.
        cmp::min(len - len / 2, MIN_SQRT_RUN_LEN)
    } else {
        sqrt_approx(len)
    };

    // (stack_len, runs, desired_depths) together form a stack maintaining run
    // information for the powersort heuristic. desired_depths[i] is the desired
    // depth of the merge node that merges runs[i] with the run that comes after
    // it.
    let mut stack_len = 0;
    let mut run_storage = MaybeUninit::<[DriftsortRun; 66]>::uninit();
    let runs: *mut DriftsortRun = run_storage.as_mut_ptr().cast();
    let mut desired_depth_storage = MaybeUninit::<[u8; 66]>::uninit();
    let desired_depths: *mut u8 = desired_depth_storage.as_mut_ptr().cast();

    let mut scan_idx = 0;
    let mut prev_run = DriftsortRun::new_sorted(0); // Initial dummy run.
    loop {
        // Compute the next run and the desired depth of the merge node between
        // prev_run and next_run. On the last iteration we create a dummy run
        // with root-level desired depth to fully collapse the merge tree.
        let (next_run, desired_depth);
        if scan_idx < len {
            next_run =
                create_run(&mut v[scan_idx..], scratch, min_good_run_len, eager_sort, is_less);
            desired_depth = merge_tree_depth(
                scan_idx - prev_run.len(),
                scan_idx,
                scan_idx + next_run.len(),
                scale_factor,
            );
        } else {
            next_run = DriftsortRun::new_sorted(0);
            desired_depth = 0;
        };

        // Process the merge nodes between earlier runs[i] that have a desire to
        // be deeper in the merge tree than the merge node for the splitpoint
        // between prev_run and next_run.
        //
        // SAFETY: first note that this is the only place we modify stack_len,
        // runs or desired depths. We maintain the following invariants:
        //  1. The first stack_len elements of runs/desired_depths are initialized.
        //  2. For all valid i > 0, desired_depths[i] < desired_depths[i+1].
        //  3. The sum of all valid runs[i].len() plus prev_run.len() equals
        //     scan_idx.
        unsafe {
            while stack_len > 1 && *desired_depths.add(stack_len - 1) >= desired_depth {
                // Desired depth greater than or equal to the upcoming desired
                // depth: pop the left neighbor run from the stack and merge it
                // into prev_run.
                let left = *runs.add(stack_len - 1);
                let merged_len = left.len() + prev_run.len();
                let merge_start_idx = scan_idx - merged_len;
                let merge_slice = v.get_unchecked_mut(merge_start_idx..scan_idx);
                prev_run = logical_merge(merge_slice, scratch, left, prev_run, is_less);
                stack_len -= 1;
            }

            // We now know that desired_depths[stack_len - 1] < desired_depth,
            // maintaining our invariant. This also guarantees we don't overflow
            // the stack as merge_tree_depth(..) <= 64 and thus we can only have
            // 64 distinct values on the stack before pushing, plus our initial
            // dummy run, while our capacity is 66.
            *runs.add(stack_len) = prev_run;
            *desired_depths.add(stack_len) = desired_depth;
            stack_len += 1;
        }

        // Break before overriding the last run with our dummy run.
        if scan_idx >= len {
            break;
        }

        scan_idx += next_run.len();
        prev_run = next_run;
    }

    if !prev_run.sorted() {
        stable_quicksort(v, scratch, is_less);
    }
}
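
// A minimal usage sketch (an illustrative addition, not part of the original
// module) showing how `sort` is driven with an explicit scratch buffer. The
// scratch length of 64 is an assumption made for this example; the
// authoritative requirement is the bound documented on `sort` above. It also
// assumes a test harness, which `core` itself does not compile with.
#[cfg(test)]
mod sort_usage_sketch {
    use super::*;

    #[test]
    fn sorts_small_input() {
        let mut v = [5u32, 1, 4, 1, 5, 9, 2, 6, 5, 3];
        // `MaybeUninit<u32>` is `Copy`, so the array-repeat form works here.
        let mut scratch = [MaybeUninit::<u32>::uninit(); 64];
        // `eager_sort = false` permits quicksort on unsorted runs; the closure
        // is the strict less-than comparison.
        sort(&mut v, &mut scratch, false, &mut |a, b| a < b);
        assert!(v.windows(2).all(|w| w[0] <= w[1]));
    }
}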

// Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods That Optimally
// Adapt to Existing Runs by J. Ian Munro and Sebastian Wild.
//
// This method forms a binary merge tree, where each internal node corresponds
// to a splitting point between the adjacent runs that have to be merged. If we
// visualize our array as the number line from 0 to 1, we want to find the
// dyadic fraction with smallest denominator that lies between the midpoints of
// our to-be-merged slices. The exponent in the dyadic fraction indicates the
// desired depth in the binary merge tree this internal node wishes to have.
// This does not always correspond to the actual depth due to the inherent
// imbalance in runs, but we follow it as closely as possible.
//
// As an optimization we rescale the number line from [0, 1) to [0, 2^62). Then
// finding the simplest dyadic fraction between midpoints corresponds to finding
// the most significant bit difference of the midpoints. We save scale_factor =
// ceil(2^62 / n) to perform this rescaling using a multiplication, avoiding
// having to repeatedly do integer divides. This rescaling isn't exact when n is
// not a power of two since we use integers and not reals, but the result is
// very close, and in fact when n < 2^30 the resulting tree is equivalent as the
// approximation errors stay entirely in the lower order bits.
//
// Thus for the splitting point between two adjacent slices [a, b) and [b, c)
// the desired depth of the corresponding merge node is CLZ((a+b)*f ^ (b+c)*f),
// where CLZ counts the number of leading zeros in an integer and f is our scale
// factor. Note that we omitted the division by two in the midpoint
// calculations, as this simply shifts the bits by one position (and thus always
// adds one to the result), and we only care about the relative depths.
//
// Finally, if we try to upper bound x = (a+b)*f giving x = (n-1 + n) * ceil(2^62 / n) then
//    x < (2^62 / n + 1) * 2n
//    x < 2^63 + 2n
// So as long as n < 2^62 we find that x < 2^64, meaning our operations do not
// overflow.
#[inline(always)]
fn merge_tree_scale_factor(n: usize) -> u64 {
    if usize::BITS > u64::BITS {
        panic!("Platform not supported");
    }

    ((1 << 62) + n as u64 - 1) / n as u64
}

// Note: merge_tree_depth output is < 64 when left < right as f*x and f*y must
// differ in some bit, and is <= 64 always.
#[inline(always)]
fn merge_tree_depth(left: usize, mid: usize, right: usize, scale_factor: u64) -> u8 {
    let x = left as u64 + mid as u64;
    let y = mid as u64 + right as u64;
    ((scale_factor * x) ^ (scale_factor * y)).leading_zeros() as u8
}
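
// A worked example of the computation above (an illustrative addition, not
// part of the original file). For n = 8 the scale factor is exactly
// ceil(2^62 / 8) = 2^59, so the rescaling is exact and the desired depths
// mirror split balance: the balanced root split at index 4 is shallower than
// the split at index 2, which is shallower than the split at index 1. Assumes
// a test harness, which `core` itself does not compile with.
#[cfg(test)]
mod merge_tree_depth_sketch {
    use super::*;

    #[test]
    fn depth_follows_split_balance() {
        let f = merge_tree_scale_factor(8);
        assert_eq!(f, 1u64 << 59);
        // Merge node between [0, 4) and [4, 8): CLZ(4f ^ 12f) = CLZ(2^62) = 1.
        assert_eq!(merge_tree_depth(0, 4, 8, f), 1);
        // Merge node between [0, 2) and [2, 4): CLZ(2f ^ 6f) = CLZ(2^61) = 2.
        assert_eq!(merge_tree_depth(0, 2, 4, f), 2);
        // Merge node between [0, 1) and [1, 2): CLZ(f ^ 3f) = CLZ(2^60) = 3.
        assert_eq!(merge_tree_depth(0, 1, 2, f), 3);
    }
}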

fn sqrt_approx(n: usize) -> usize {
    // Note that sqrt(n) = n^(1/2), and that 2^log2(n) = n. We combine these
    // two facts to approximate sqrt(n) as 2^(log2(n) / 2). Because our integer
    // log floors we want to add 0.5 to compensate for this on average, so our
    // initial approximation is 2^((1 + floor(log2(n))) / 2).
    //
    // We then apply an iteration of Newton's method to improve our
    // approximation, which for sqrt(n) is a1 = (a0 + n / a0) / 2.
    //
    // Finally we note that the exponentiation / division can be done directly
    // with shifts. We OR with 1 to avoid zero-checks in the integer log.
    let ilog = (n | 1).ilog2();
    let shift = (1 + ilog) / 2;
    ((1 << shift) + (n >> shift)) / 2
}
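
// A quick numeric check of the approximation (an illustrative addition, not
// part of the original file). Near powers of four the single Newton iteration
// gives the exact root; in between it stays within a few percent, which is all
// the run-length threshold above requires. Assumes a test harness.
#[cfg(test)]
mod sqrt_approx_sketch {
    use super::*;

    #[test]
    fn approximation_is_close() {
        assert_eq!(sqrt_approx(4096), 64); // exact: 4096 = 2^12
        assert_eq!(sqrt_approx(1_000_000), 1000); // also exact here
        assert_eq!(sqrt_approx(10_000), 103); // true sqrt is 100, ~3% off
    }
}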

// Lazy logical runs as in Glidesort.
#[inline(always)]
fn logical_merge<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    left: DriftsortRun,
    right: DriftsortRun,
    is_less: &mut F,
) -> DriftsortRun {
    // If one or both of the runs are sorted do a physical merge, using
    // quicksort to sort the unsorted run if present. We also *need* to
    // physically merge if the combined runs would not fit in the scratch space
    // anymore (as this would mean we are no longer able to quicksort them).
    let len = v.len();
    let can_fit_in_scratch = len <= scratch.len();
    if !can_fit_in_scratch || left.sorted() || right.sorted() {
        if !left.sorted() {
            stable_quicksort(&mut v[..left.len()], scratch, is_less);
        }
        if !right.sorted() {
            stable_quicksort(&mut v[left.len()..], scratch, is_less);
        }
        merge(v, scratch, left.len(), is_less);

        DriftsortRun::new_sorted(len)
    } else {
        DriftsortRun::new_unsorted(len)
    }
}
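
// A behavior sketch for `logical_merge` (an illustrative addition, not part of
// the original file): two sorted runs are physically merged on the spot, while
// two unsorted runs that fit in scratch are only concatenated logically,
// deferring all work. Assumes a test harness.
#[cfg(test)]
mod logical_merge_sketch {
    use super::*;

    #[test]
    fn physical_vs_logical() {
        let mut scratch = [MaybeUninit::<u32>::uninit(); 64];

        // Both runs sorted: a physical merge happens immediately.
        let mut v = [2u32, 4, 6, 1, 3, 5];
        let out = logical_merge(
            &mut v,
            &mut scratch,
            DriftsortRun::new_sorted(3),
            DriftsortRun::new_sorted(3),
            &mut |a, b| a < b,
        );
        assert!(out.sorted() && out.len() == 6);
        assert_eq!(v, [1, 2, 3, 4, 5, 6]);

        // Both runs unsorted and the whole slice fits in scratch: the merge is
        // deferred and the slice is left untouched.
        let mut w = [4u32, 2, 6, 5, 1, 3];
        let out = logical_merge(
            &mut w,
            &mut scratch,
            DriftsortRun::new_unsorted(3),
            DriftsortRun::new_unsorted(3),
            &mut |a, b| a < b,
        );
        assert!(!out.sorted() && out.len() == 6);
        assert_eq!(w, [4, 2, 6, 5, 1, 3]);
    }
}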

/// Creates a new logical run.
///
/// A logical run can either be sorted or unsorted. If there is a pre-existing
/// run that clears the `min_good_run_len` threshold it is returned as a sorted
/// run. If not, the result depends on the value of `eager_sort`. If it is true,
/// then a sorted run of length `min(T::small_sort_threshold(), len)` is
/// returned, and if it is false an unsorted run of length
/// `min(min_good_run_len, len)` is returned.
fn create_run<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    min_good_run_len: usize,
    eager_sort: bool,
    is_less: &mut F,
) -> DriftsortRun {
    let len = v.len();
    if len >= min_good_run_len {
        let (run_len, was_reversed) = find_existing_run(v, is_less);

        // SAFETY: find_existing_run promises to return a valid run_len.
        unsafe { intrinsics::assume(run_len <= len) };

        if run_len >= min_good_run_len {
            if was_reversed {
                v[..run_len].reverse();
            }

            return DriftsortRun::new_sorted(run_len);
        }
    }

    if eager_sort {
        // We call quicksort with a len that will immediately call small-sort.
        // By not calling the small-sort directly here it can always be inlined into
        // the quicksort itself, making the recursive base case faster and the
        // code generally more binary-size efficient.
        let eager_run_len = cmp::min(T::small_sort_threshold(), len);
        quicksort(&mut v[..eager_run_len], scratch, 0, None, is_less);
        DriftsortRun::new_sorted(eager_run_len)
    } else {
        DriftsortRun::new_unsorted(cmp::min(min_good_run_len, len))
    }
}
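
// A behavior sketch for `create_run` (an illustrative addition, not part of
// the original file), matching the doc comment above: a pre-existing run that
// clears the threshold comes back sorted, otherwise (with `eager_sort` false)
// an unsorted logical run of length `min(min_good_run_len, len)` is returned.
// The threshold of 4 is chosen only for illustration. Assumes a test harness.
#[cfg(test)]
mod create_run_sketch {
    use super::*;

    #[test]
    fn sorted_and_unsorted_runs() {
        let mut scratch = [MaybeUninit::<u32>::uninit(); 64];

        // Fully ascending input clears the threshold: sorted run of length 8.
        let mut asc = [1u32, 2, 3, 4, 5, 6, 7, 8];
        let run = create_run(&mut asc, &mut scratch, 4, false, &mut |a, b| a < b);
        assert!(run.sorted() && run.len() == 8);

        // No pre-existing run of length >= 4: unsorted run of length 4.
        let mut mixed = [3u32, 1, 4, 1, 5, 9, 2, 6];
        let run = create_run(&mut mixed, &mut scratch, 4, false, &mut |a, b| a < b);
        assert!(!run.sorted() && run.len() == 4);
    }
}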

fn stable_quicksort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
    // The binary OR by one is used to eliminate the zero-check in the logarithm.
    let limit = 2 * (v.len() | 1).ilog2();
    quicksort(v, scratch, limit, None, is_less);
}

/// Compactly stores the length of a run, and whether or not it is sorted. This
/// can always fit in a `usize` because the maximum slice length is [`isize::MAX`].
#[derive(Copy, Clone)]
struct DriftsortRun(usize);

impl DriftsortRun {
    #[inline(always)]
    fn new_sorted(length: usize) -> Self {
        Self((length << 1) | 1)
    }

    #[inline(always)]
    fn new_unsorted(length: usize) -> Self {
        Self(length << 1)
    }

    #[inline(always)]
    fn sorted(self) -> bool {
        self.0 & 1 == 1
    }

    #[inline(always)]
    fn len(self) -> usize {
        self.0 >> 1
    }
}
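
// A round-trip sketch of the bit packing (an illustrative addition, not part
// of the original file): the length lives in the upper bits and the sortedness
// flag in the lowest bit, so both are recovered independently. Assumes a test
// harness.
#[cfg(test)]
mod driftsort_run_sketch {
    use super::DriftsortRun;

    #[test]
    fn pack_roundtrip() {
        let sorted = DriftsortRun::new_sorted(1234);
        assert!(sorted.sorted());
        assert_eq!(sorted.len(), 1234);

        let unsorted = DriftsortRun::new_unsorted(1234);
        assert!(!unsorted.sorted());
        assert_eq!(unsorted.len(), 1234);
    }
}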