core/slice/sort/stable/mod.rs

//! This module contains the entry points for `slice::sort`.

#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
use crate::cmp;
use crate::mem::{MaybeUninit, SizedTypeProperties};
#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
use crate::slice::sort::shared::smallsort::{
    SMALL_SORT_GENERAL_SCRATCH_LEN, StableSmallSortTypeImpl, insertion_sort_shift_left,
};
use crate::{cfg_match, intrinsics};

pub(crate) mod merge;

#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
pub(crate) mod drift;
#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
pub(crate) mod quicksort;

#[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))]
pub(crate) mod tiny;

/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
/// Design document:
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
///
/// Upholds all safety properties outlined here:
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md>
#[inline(always)]
pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // Arrays of zero-sized types are always all-equal, and thus sorted.
    if T::IS_ZST {
        return;
    }

    // Instrumenting the standard library showed that 90+% of the calls to sort
    // by rustc are either of size 0 or 1.
    let len = v.len();
    if intrinsics::likely(len < 2) {
        return;
    }

    cfg_match! {
        any(feature = "optimize_for_size", target_pointer_width = "16") => {
            // Unlike driftsort, mergesort only requires len / 2,
            // not len - len / 2.
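            // (A merge only ever needs to buffer one of the two runs being merged;
            // here that is assumed to be the shorter run produced by splitting at
            // the midpoint, so len / 2 scratch elements suffice.)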
            let alloc_len = len / 2;

            cfg_match! {
                target_pointer_width = "16" => {
                    let mut heap_buf = BufT::with_capacity(alloc_len);
                    let scratch = heap_buf.as_uninit_slice_mut();
                }
                _ => {
                    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
                    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
                    let mut stack_buf = AlignedStorage::<T, 4096>::new();
                    let stack_scratch = stack_buf.as_uninit_slice_mut();
                    let mut heap_buf;
                    let scratch = if stack_scratch.len() >= alloc_len {
                        stack_scratch
                    } else {
                        heap_buf = BufT::with_capacity(alloc_len);
                        heap_buf.as_uninit_slice_mut()
                    };
                }
            }

            tiny::mergesort(v, scratch, is_less);
        }
        _ => {
            // More advanced sorting methods than insertion sort are faster if called in
            // a hot loop for small inputs, but for general-purpose code the small
            // binary size of insertion sort is more important. The instruction cache in
            // modern processors is very valuable, and for a single sort call in general
            // purpose code any gains from an advanced method are cancelled by i-cache
            // misses during the sort, and thrashing the i-cache for surrounding code.
            const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
            if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
                insertion_sort_shift_left(v, 1, is_less);
                return;
            }

            driftsort_main::<T, F, BufT>(v, is_less);
        }
    }
}

/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
#[inline(never)]
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // By allocating n elements of memory we can ensure the entire input can
    // be sorted using stable quicksort, which allows better performance on
    // random and low-cardinality distributions. However, we still want to
    // reduce our memory usage to n - n / 2 for large inputs. We do this by scaling
    // our allocation as max(n - n / 2, min(n, 8MB)), ensuring we scale like n for
    // small inputs and n - n / 2 for large inputs, without a sudden drop off. We
    // also need to ensure our alloc >= SMALL_SORT_GENERAL_SCRATCH_LEN, as the
    // small-sort always needs this much memory.
    //
    // driftsort will produce unsorted runs of up to min_good_run_len, which
    // is at most len - len / 2.
    // Unsorted runs need to be processed by quicksort, which requires as much
    // scratch space as the run length, therefore the scratch space must be at
    // least len - len / 2.
    // If min_good_run_len is ever modified, this code must be updated to allocate
    // the correct scratch size for it.
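    //
    // Illustrative figures only: with an 8-byte `T`, `max_full_alloc` below works out
    // to 1_000_000 elements, so ignoring the `SMALL_SORT_GENERAL_SCRATCH_LEN` lower
    // bound the formula gives:
    //   len = 1_000      -> max(500,       min(1_000,      1_000_000)) = 1_000     (scales like n)
    //   len = 1_500_000  -> max(750_000,   min(1_500_000,  1_000_000)) = 1_000_000
    //   len = 10_000_000 -> max(5_000_000, min(10_000_000, 1_000_000)) = 5_000_000 (scales like n - n / 2)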
    const MAX_FULL_ALLOC_BYTES: usize = 8_000_000; // 8MB
    let max_full_alloc = MAX_FULL_ALLOC_BYTES / size_of::<T>();
    let len = v.len();
    let alloc_len = cmp::max(
        cmp::max(len - len / 2, cmp::min(len, max_full_alloc)),
        SMALL_SORT_GENERAL_SCRATCH_LEN,
    );

    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
    let mut stack_buf = AlignedStorage::<T, 4096>::new();
    let stack_scratch = stack_buf.as_uninit_slice_mut();
    let mut heap_buf;
    let scratch = if stack_scratch.len() >= alloc_len {
        stack_scratch
    } else {
        heap_buf = BufT::with_capacity(alloc_len);
        heap_buf.as_uninit_slice_mut()
    };

    // For small inputs using quicksort is not yet beneficial, and a single
    // small-sort or two small-sorts plus a single merge outperforms it, so use
    // eager mode.
    let eager_sort = len <= T::small_sort_threshold() * 2;
    crate::slice::sort::stable::drift::sort(v, scratch, eager_sort, is_less);
}

#[doc(hidden)]
/// Abstracts an owned memory buffer, so that the sort code can live in `core`, where no
/// allocation is possible. This trait can then be implemented in a place that has access to
/// allocation.
pub trait BufGuard<T> {
    /// Creates a new buffer with room for at least `capacity` elements.
    fn with_capacity(capacity: usize) -> Self;
    /// Returns mutable access to uninitialized memory owned by the buffer.
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>];
}
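
// Illustration only (a hedged sketch, not part of this module): a crate that does have an
// allocator could satisfy `BufGuard` roughly like the hypothetical `VecBuf` below, backing
// the buffer with a `Vec` and handing out its spare capacity:
//
//     struct VecBuf<T>(Vec<T>);
//
//     impl<T> BufGuard<T> for VecBuf<T> {
//         fn with_capacity(capacity: usize) -> Self {
//             VecBuf(Vec::with_capacity(capacity))
//         }
//
//         fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
//             // The allocated-but-uninitialized tail of the `Vec` is exactly the
//             // scratch memory the sort needs.
//             self.0.spare_capacity_mut()
//         }
//     }
//
// A caller with such a type would then instantiate the sort as, for example,
// `sort::<u32, _, VecBuf<u32>>(&mut data, &mut |a, b| a < b)`.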

#[repr(C)]
struct AlignedStorage<T, const N: usize> {
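    // A zero-length array of `T` occupies no space but raises the struct's alignment to
    // `align_of::<T>()`, which lets `storage` be reinterpreted as `T` elements below.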
    _align: [T; 0],
    storage: [MaybeUninit<u8>; N],
}

impl<T, const N: usize> AlignedStorage<T, N> {
    fn new() -> Self {
        Self { _align: [], storage: [const { MaybeUninit::uninit() }; N] }
    }

    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
        let len = N / size_of::<T>();
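        // Illustrative only: with `N = 4096` and an 8-byte `T` such as `u64`, this gives
        // 4096 / 8 = 512 scratch elements; any remainder bytes are simply unused.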

        // SAFETY: `_align` ensures we are correctly aligned.
        unsafe { core::slice::from_raw_parts_mut(self.storage.as_mut_ptr().cast(), len) }
    }
}