core/slice/sort/stable/mod.rs

//! This module contains the entry points for `slice::sort`.

#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
use crate::cmp;
use crate::intrinsics;
use crate::mem::{self, MaybeUninit, SizedTypeProperties};
#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
use crate::slice::sort::shared::smallsort::{
    SMALL_SORT_GENERAL_SCRATCH_LEN, StableSmallSortTypeImpl, insertion_sort_shift_left,
};

pub(crate) mod merge;

#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
pub(crate) mod drift;
#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
pub(crate) mod quicksort;

#[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))]
pub(crate) mod tiny;

/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
/// Design document:
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
///
/// Upholds all safety properties outlined here:
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md>
#[inline(always)]
pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // Arrays of zero-sized types are always all-equal, and thus sorted.
    if T::IS_ZST {
        return;
    }

    // Instrumenting the standard library showed that 90+% of the calls to sort
    // made by rustc are on slices of length 0 or 1.
    let len = v.len();
    if intrinsics::likely(len < 2) {
        return;
    }

    cfg_if! {
        if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] {
            let alloc_len = len / 2;

            cfg_if! {
                if #[cfg(target_pointer_width = "16")] {
                    let mut heap_buf = BufT::with_capacity(alloc_len);
                    let scratch = heap_buf.as_uninit_slice_mut();
                } else {
                    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
                    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
                    let mut stack_buf = AlignedStorage::<T, 4096>::new();
                    let stack_scratch = stack_buf.as_uninit_slice_mut();
                    let mut heap_buf;
                    let scratch = if stack_scratch.len() >= alloc_len {
                        stack_scratch
                    } else {
                        heap_buf = BufT::with_capacity(alloc_len);
                        heap_buf.as_uninit_slice_mut()
                    };
                }
            }

            tiny::mergesort(v, scratch, is_less);
        } else {
            // More advanced sorting methods than insertion sort are faster if called in
            // a hot loop for small inputs, but for general-purpose code the small
            // binary size of insertion sort is more important. The instruction cache in
            // modern processors is very valuable, and for a single sort call in
            // general-purpose code any gains from an advanced method are cancelled out
            // by i-cache misses during the sort and by thrashing the i-cache of the
            // surrounding code.
            const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
            if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
                insertion_sort_shift_left(v, 1, is_less);
                return;
            }

            driftsort_main::<T, F, BufT>(v, is_less);
        }
    }
}

/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
#[inline(never)]
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // By allocating a scratch buffer of n elements we can ensure the entire
    // input can be sorted using stable quicksort, which allows better
    // performance on random and low-cardinality distributions. However, we
    // still want to reduce our memory usage to n / 2 for large inputs. We do
    // this by scaling our allocation as max(n / 2, min(n, 8MB)), ensuring we
    // scale like n for small inputs and like n / 2 for large inputs, without a
    // sudden drop-off. We also need to ensure our allocation is at least
    // SMALL_SORT_GENERAL_SCRATCH_LEN elements, as the small-sort always needs
    // this much scratch memory.
    const MAX_FULL_ALLOC_BYTES: usize = 8_000_000; // 8MB
    let max_full_alloc = MAX_FULL_ALLOC_BYTES / mem::size_of::<T>();
    let len = v.len();
    let alloc_len =
        cmp::max(cmp::max(len / 2, cmp::min(len, max_full_alloc)), SMALL_SORT_GENERAL_SCRATCH_LEN);
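    // Worked example (assuming `T = u64`, so `max_full_alloc` is 1_000_000 elements):
    // for `len = 100_000` this yields `max(50_000, 100_000) = 100_000` (a full-size
    // scratch buffer), while for `len = 10_000_000` it yields
    // `max(5_000_000, 1_000_000) = 5_000_000` (half-size), matching the scaling
    // described above.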

    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
    let mut stack_buf = AlignedStorage::<T, 4096>::new();
    let stack_scratch = stack_buf.as_uninit_slice_mut();
    let mut heap_buf;
    let scratch = if stack_scratch.len() >= alloc_len {
        stack_scratch
    } else {
        heap_buf = BufT::with_capacity(alloc_len);
        heap_buf.as_uninit_slice_mut()
    };
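    // As a rough illustration (assuming `T = u64`): the 4096-byte stack buffer holds
    // 512 scratch elements, so small inputs whose `alloc_len` fits within that are
    // sorted without ever touching the allocator; larger inputs fall back to `BufT`.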

    // For small inputs using quicksort is not yet beneficial, and a single
    // small-sort or two small-sorts plus a single merge outperforms it, so use
    // eager mode.
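    // E.g. if the small-sort threshold for `T` were 32 elements, any input of up to 64
    // elements would skip quicksort entirely: at most two small-sorted runs and one merge.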
    let eager_sort = len <= T::small_sort_threshold() * 2;
    crate::slice::sort::stable::drift::sort(v, scratch, eager_sort, is_less);
}

#[doc(hidden)]
/// Abstracts over an owned memory buffer, so that the sort code can live in `core`, where no
/// allocation is possible. The trait is then implemented by code that does have access to an
/// allocator.
pub trait BufGuard<T> {
    /// Creates a new buffer that can hold at least `capacity` elements.
    fn with_capacity(capacity: usize) -> Self;
    /// Returns mutable access to uninitialized memory owned by the buffer.
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>];
}
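
// A minimal sketch of how a crate with access to an allocator could satisfy this trait
// (a hypothetical `Vec`-backed wrapper, shown for illustration only; not necessarily how
// the standard library implements it):
//
//     struct VecBuf<T>(Vec<T>);
//
//     impl<T> BufGuard<T> for VecBuf<T> {
//         fn with_capacity(capacity: usize) -> Self {
//             VecBuf(Vec::with_capacity(capacity))
//         }
//
//         fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
//             // The unused capacity of an empty `Vec` is exactly the kind of
//             // uninitialized, owned memory the sort needs.
//             self.0.spare_capacity_mut()
//         }
//     }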

/// Fixed-size scratch storage placed on the stack. The zero-length `[T; 0]` field forces the
/// struct to the alignment of `T` without storing any `T`s, so `storage` can be reinterpreted
/// as a slice of `MaybeUninit<T>`.
#[repr(C)]
struct AlignedStorage<T, const N: usize> {
    _align: [T; 0],
    storage: [MaybeUninit<u8>; N],
}

impl<T, const N: usize> AlignedStorage<T, N> {
    fn new() -> Self {
        Self { _align: [], storage: [const { MaybeUninit::uninit() }; N] }
    }

    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
        let len = N / mem::size_of::<T>();
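        // E.g. with `T = u64` and `N = 4096` this yields 512 slots; if `T` is larger than
        // `N` bytes the slice is empty and callers fall back to the heap buffer instead.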

        // SAFETY: `_align` ensures we are correctly aligned.
        unsafe { core::slice::from_raw_parts_mut(self.storage.as_mut_ptr().cast(), len) }
    }
}