core/slice/sort/shared/smallsort.rs

//! This module contains a variety of sort implementations that are optimized for small lengths.

use crate::mem::{self, ManuallyDrop, MaybeUninit};
use crate::slice::sort::shared::FreezeMarker;
use crate::{intrinsics, ptr, slice};

// It's important to differentiate between SMALL_SORT_THRESHOLD performance for
// small slices and small-sort performance sorting small sub-slices as part of
// the main quicksort loop. For the former, testing showed that the
// representative benchmarks for real-world performance are cold CPU state and
// not single-size hot benchmarks. For the latter the CPU will call them many
// times, so hot benchmarks are fine and more realistic. And it's worth it to
// optimize sorting small sub-slices with more sophisticated solutions than
// insertion sort.

/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait StableSmallSortTypeImpl: Sized {
    /// It is valid to call `small_sort` for inputs whose length is at most the value returned
    /// by this function.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(
        v: &mut [Self],
        scratch: &mut [MaybeUninit<Self>],
        is_less: &mut F,
    );
}
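
// Illustrative dispatch sketch (hypothetical driver code, not part of this module): a stable
// driver sort would typically consult the threshold before calling the small-sort, roughly along
// these lines (`stable_driver` is a made-up name, and the caller is responsible for providing a
// large enough `scratch` buffer):
//
// fn stable_driver<T: StableSmallSortTypeImpl, F: FnMut(&T, &T) -> bool>(
//     v: &mut [T],
//     scratch: &mut [MaybeUninit<T>],
//     is_less: &mut F,
// ) {
//     if v.len() <= T::small_sort_threshold() {
//         T::small_sort(v, scratch, is_less);
//         return;
//     }
//     // ... otherwise continue with the full sorting algorithm ...
// }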

impl<T> StableSmallSortTypeImpl for T {
    #[inline(always)]
    default fn small_sort_threshold() -> usize {
        // Optimal number of comparisons, and good perf.
        SMALL_SORT_FALLBACK_THRESHOLD
    }

    #[inline(always)]
    default fn small_sort<F: FnMut(&T, &T) -> bool>(
        v: &mut [T],
        _scratch: &mut [MaybeUninit<T>],
        is_less: &mut F,
    ) {
        if v.len() >= 2 {
            insertion_sort_shift_left(v, 1, is_less);
        }
    }
}

impl<T: FreezeMarker> StableSmallSortTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        SMALL_SORT_GENERAL_THRESHOLD
    }

    #[inline(always)]
    fn small_sort<F: FnMut(&T, &T) -> bool>(
        v: &mut [T],
        scratch: &mut [MaybeUninit<T>],
        is_less: &mut F,
    ) {
        small_sort_general_with_scratch(v, scratch, is_less);
    }
}

/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait UnstableSmallSortTypeImpl: Sized {
    /// It is valid to call `small_sort` for inputs whose length is at most the value returned
    /// by this function.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
}

impl<T> UnstableSmallSortTypeImpl for T {
    #[inline(always)]
    default fn small_sort_threshold() -> usize {
        SMALL_SORT_FALLBACK_THRESHOLD
    }

    #[inline(always)]
    default fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        small_sort_fallback(v, is_less);
    }
}

impl<T: FreezeMarker> UnstableSmallSortTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        <T as UnstableSmallSortFreezeTypeImpl>::small_sort_threshold()
    }

    #[inline(always)]
    fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        <T as UnstableSmallSortFreezeTypeImpl>::small_sort(v, is_less);
    }
}

/// FIXME(const_trait_impl) use original ipnsort approach with choose_unstable_small_sort,
/// as found here <https://github.com/Voultapher/sort-research-rs/blob/438fad5d0495f65d4b72aa87f0b62fc96611dff3/ipnsort/src/smallsort.rs#L83C10-L83C36>.
pub(crate) trait UnstableSmallSortFreezeTypeImpl: Sized + FreezeMarker {
    fn small_sort_threshold() -> usize;

    fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
}

impl<T: FreezeMarker> UnstableSmallSortFreezeTypeImpl for T {
    #[inline(always)]
    default fn small_sort_threshold() -> usize {
        if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            SMALL_SORT_GENERAL_THRESHOLD
        } else {
            SMALL_SORT_FALLBACK_THRESHOLD
        }
    }

    #[inline(always)]
    default fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            small_sort_general(v, is_less);
        } else {
            small_sort_fallback(v, is_less);
        }
    }
}

/// SAFETY: Only used for run-time optimization heuristic.
#[rustc_unsafe_specialization_marker]
trait CopyMarker {}

impl<T: Copy> CopyMarker for T {}

impl<T: FreezeMarker + CopyMarker> UnstableSmallSortFreezeTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        if has_efficient_in_place_swap::<T>()
            && (size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
        {
            SMALL_SORT_NETWORK_THRESHOLD
        } else if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            SMALL_SORT_GENERAL_THRESHOLD
        } else {
            SMALL_SORT_FALLBACK_THRESHOLD
        }
    }

    #[inline(always)]
    fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        if has_efficient_in_place_swap::<T>()
            && (size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
        {
            small_sort_network(v, is_less);
        } else if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            small_sort_general(v, is_less);
        } else {
            small_sort_fallback(v, is_less);
        }
    }
}
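
// Illustrative examples of how the specialization above resolves (a sketch, not exhaustive, and
// subject to the size limits below): `u64` is `Copy + Freeze` and 8 bytes, so it takes the
// sorting-network path; `String` is `Freeze` but not `Copy`, so it takes the general path; a type
// with interior mutability such as `Cell<u32>` is not `Freeze` and falls back to plain insertion
// sort.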

/// Optimal number of comparisons, and good perf.
const SMALL_SORT_FALLBACK_THRESHOLD: usize = 16;

/// From a comparison perspective 20 was ~2% more efficient for fully random input, but for
/// wall-clock performance choosing 32 yielded better performance overall.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
const SMALL_SORT_GENERAL_THRESHOLD: usize = 32;

/// [`small_sort_general`] uses [`sort8_stable`] as primitive and does a kind of ping-pong merge,
/// where the output of the first two [`sort8_stable`] calls is stored at the end of the scratch
/// buffer. This simplifies panic handling and avoids additional copies. This affects the required
/// scratch buffer size.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
pub(crate) const SMALL_SORT_GENERAL_SCRATCH_LEN: usize = SMALL_SORT_GENERAL_THRESHOLD + 16;

/// SAFETY: If you change this value, you have to adjust [`small_sort_network`] !
const SMALL_SORT_NETWORK_THRESHOLD: usize = 32;
const SMALL_SORT_NETWORK_SCRATCH_LEN: usize = SMALL_SORT_NETWORK_THRESHOLD;

/// Using a stack array could cause a stack overflow if the type `T` is very large. To be
/// conservative we limit the usage of small-sorts that require a stack array to types that fit
/// within this limit.
const MAX_STACK_ARRAY_SIZE: usize = 4096;
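
// Illustrative arithmetic for the size checks above (a sketch, not normative): for `T = u64` the
// general path needs `8 * 48 = 384` bytes of stack scratch and the network path `8 * 32 = 256`
// bytes, both well under `MAX_STACK_ARRAY_SIZE`; for a hypothetical 128-byte `T`,
// `128 * 48 = 6144` exceeds the limit, so the unstable small-sorts fall back to insertion sort.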

fn small_sort_fallback<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    if v.len() >= 2 {
        insertion_sort_shift_left(v, 1, is_less);
    }
}

fn small_sort_general<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    let mut stack_array = MaybeUninit::<[T; SMALL_SORT_GENERAL_SCRATCH_LEN]>::uninit();

    // SAFETY: The memory is backed by `stack_array`, and the operation is safe as long as the len
    // is the same.
    let scratch = unsafe {
        slice::from_raw_parts_mut(
            stack_array.as_mut_ptr() as *mut MaybeUninit<T>,
            SMALL_SORT_GENERAL_SCRATCH_LEN,
        )
    };

    small_sort_general_with_scratch(v, scratch, is_less);
}

fn small_sort_general_with_scratch<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return;
    }

    if scratch.len() < len + 16 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();
    let len_div_2 = len / 2;

    // SAFETY: See individual comments.
    unsafe {
        let scratch_base = scratch.as_mut_ptr() as *mut T;

        let presorted_len = if const { size_of::<T>() <= 16 } && len >= 16 {
            // SAFETY: scratch_base is valid and has enough space.
            sort8_stable(v_base, scratch_base, scratch_base.add(len), is_less);
            sort8_stable(
                v_base.add(len_div_2),
                scratch_base.add(len_div_2),
                scratch_base.add(len + 8),
                is_less,
            );

            8
        } else if len >= 8 {
            // SAFETY: scratch_base is valid and has enough space.
            sort4_stable(v_base, scratch_base, is_less);
            sort4_stable(v_base.add(len_div_2), scratch_base.add(len_div_2), is_less);

            4
        } else {
            ptr::copy_nonoverlapping(v_base, scratch_base, 1);
            ptr::copy_nonoverlapping(v_base.add(len_div_2), scratch_base.add(len_div_2), 1);

            1
        };

        for offset in [0, len_div_2] {
            // SAFETY: at this point dst is initialized with presorted_len elements.
            // We extend this to desired_len, src is valid for desired_len elements.
            let src = v_base.add(offset);
            let dst = scratch_base.add(offset);
            let desired_len = if offset == 0 { len_div_2 } else { len - len_div_2 };

            for i in presorted_len..desired_len {
                ptr::copy_nonoverlapping(src.add(i), dst.add(i), 1);
                insert_tail(dst, dst.add(i), is_less);
            }
        }

        // SAFETY: see comment in `CopyOnDrop::drop`.
        let drop_guard = CopyOnDrop { src: scratch_base, dst: v_base, len };

        // SAFETY: at this point scratch_base is fully initialized, allowing us
        // to use it as the source of our merge back into the original array.
        // If a panic occurs we ensure the original array is restored to a valid
        // permutation of the input through drop_guard. This technique is similar
        // to ping-pong merging.
        bidirectional_merge(
            &*ptr::slice_from_raw_parts(drop_guard.src, drop_guard.len),
            drop_guard.dst,
            is_less,
        );
        mem::forget(drop_guard);
    }
}

struct CopyOnDrop<T> {
    src: *const T,
    dst: *mut T,
    len: usize,
}

impl<T> Drop for CopyOnDrop<T> {
    fn drop(&mut self) {
        // SAFETY: `src` must contain `len` initialized elements, and dst must
        // be valid to write `len` elements.
        unsafe {
            ptr::copy_nonoverlapping(self.src, self.dst, self.len);
        }
    }
}
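
// `CopyOnDrop` serves two roles in this module: in `small_sort_general_with_scratch` it restores
// the whole slice from the scratch buffer if the user comparison panics mid-merge, and in
// `insert_tail` it acts as a single-element gap guard (`len == 1`) that moves the temporarily
// held element back into the hole on panic.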

fn small_sort_network<T, F>(v: &mut [T], is_less: &mut F)
where
    T: FreezeMarker,
    F: FnMut(&T, &T) -> bool,
{
    // This implementation is tuned to be efficient for integer types.

    let len = v.len();
    if len < 2 {
        return;
    }

    if len > SMALL_SORT_NETWORK_SCRATCH_LEN {
        intrinsics::abort();
    }

    let mut stack_array = MaybeUninit::<[T; SMALL_SORT_NETWORK_SCRATCH_LEN]>::uninit();

    let len_div_2 = len / 2;
    let no_merge = len < 18;

    let v_base = v.as_mut_ptr();
    let initial_region_len = if no_merge { len } else { len_div_2 };
    // SAFETY: Both possible values of `initial_region_len` are in-bounds.
    let mut region = unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base, initial_region_len) };

    // Avoid compiler unrolling; we *really* don't want that to happen here for binary-size reasons.
    loop {
        let presorted_len = if region.len() >= 13 {
            sort13_optimal(region, is_less);
            13
        } else if region.len() >= 9 {
            sort9_optimal(region, is_less);
            9
        } else {
            1
        };

        insertion_sort_shift_left(region, presorted_len, is_less);

        if no_merge {
            return;
        }

        if region.as_ptr() != v_base {
            break;
        }

        // SAFETY: The right side of `v` based on `len_div_2` is guaranteed in-bounds.
        unsafe {
            region = &mut *ptr::slice_from_raw_parts_mut(v_base.add(len_div_2), len - len_div_2)
        };
    }

    // SAFETY: We checked that T is Freeze and thus observation safe.
    // Should is_less panic, v was not modified in parity_merge and retains its original input.
    // scratch and v must not alias and scratch has v.len() space.
    unsafe {
        let scratch_base = stack_array.as_mut_ptr() as *mut T;
        bidirectional_merge(
            &mut *ptr::slice_from_raw_parts_mut(v_base, len),
            scratch_base,
            is_less,
        );
        ptr::copy_nonoverlapping(scratch_base, v_base, len);
    }
}

/// Swap two values in the slice pointed to by `v_base` at positions `a_pos` and `b_pos` if the
/// value at position `b_pos` is less than the one at position `a_pos`.
///
/// Purposefully not marked `#[inline]`, despite us wanting it to be inlined for integer-like
/// types. `is_less` could be a huge function and we want to give the compiler an option to
/// not inline this function. For the same reasons that this function is very perf critical
/// it should be in the same module as the functions that use it.
unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: the caller must guarantee that `a_pos` and `b_pos` each added to `v_base` yield valid
    // pointers into `v_base`, and are properly aligned, and part of the same allocation.
    unsafe {
        let v_a = v_base.add(a_pos);
        let v_b = v_base.add(b_pos);

        // PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be
        // in a well-defined state, without duplicates.

        // Important to only swap if it is more and not if it is equal. is_less should return false for
        // equal, so we don't swap.
        let should_swap = is_less(&*v_b, &*v_a);

        // This is a branchless version of a conditional swap.
        // The equivalent code with a branch would be:
        //
        // if should_swap {
        //     ptr::swap(v_a, v_b);
        // }

        // The goal is to generate cmov instructions here.
        let v_a_swap = should_swap.select_unpredictable(v_b, v_a);
        let v_b_swap = should_swap.select_unpredictable(v_a, v_b);

        let v_b_swap_tmp = ManuallyDrop::new(ptr::read(v_b_swap));
        ptr::copy(v_a_swap, v_a, 1);
        ptr::copy_nonoverlapping(&*v_b_swap_tmp, v_b, 1);
    }
}

/// Sorts the first 9 elements of `v` with a fast fixed function.
///
/// Should `is_less` generate substantial amounts of code the compiler can choose to not inline
/// `swap_if_less`. If the code of a sort impl changes so as to call this function in multiple
/// places, `#[inline(never)]` is recommended to keep binary-size in check. The current design of
/// `small_sort_network` makes sure to only call this once.
fn sort9_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    if v.len() < 9 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();

    // Optimal sorting network see:
    // https://bertdobbelaere.github.io/sorting_networks.html.

    // SAFETY: We checked the len.
    unsafe {
        swap_if_less(v_base, 0, 3, is_less);
        swap_if_less(v_base, 1, 7, is_less);
        swap_if_less(v_base, 2, 5, is_less);
        swap_if_less(v_base, 4, 8, is_less);
        swap_if_less(v_base, 0, 7, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 3, 8, is_less);
        swap_if_less(v_base, 5, 6, is_less);
        swap_if_less(v_base, 0, 2, is_less);
        swap_if_less(v_base, 1, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 1, 4, is_less);
        swap_if_less(v_base, 3, 6, is_less);
        swap_if_less(v_base, 5, 7, is_less);
        swap_if_less(v_base, 0, 1, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 3, 5, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 6, 7, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
    }
}

/// Sorts the first 13 elements of `v` with a fast fixed function.
///
/// Should `is_less` generate substantial amounts of code the compiler can choose to not inline
/// `swap_if_less`. If the code of a sort impl changes so as to call this function in multiple
/// places, `#[inline(never)]` is recommended to keep binary-size in check. The current design of
/// `small_sort_network` makes sure to only call this once.
fn sort13_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    if v.len() < 13 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();

    // Optimal sorting network see:
    // https://bertdobbelaere.github.io/sorting_networks.html.

    // SAFETY: We checked the len.
    unsafe {
        swap_if_less(v_base, 0, 12, is_less);
        swap_if_less(v_base, 1, 10, is_less);
        swap_if_less(v_base, 2, 9, is_less);
        swap_if_less(v_base, 3, 7, is_less);
        swap_if_less(v_base, 5, 11, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 1, 6, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 11, is_less);
        swap_if_less(v_base, 7, 9, is_less);
        swap_if_less(v_base, 8, 10, is_less);
        swap_if_less(v_base, 0, 4, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 6, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 11, 12, is_less);
        swap_if_less(v_base, 4, 6, is_less);
        swap_if_less(v_base, 5, 9, is_less);
        swap_if_less(v_base, 8, 11, is_less);
        swap_if_less(v_base, 10, 12, is_less);
        swap_if_less(v_base, 0, 5, is_less);
        swap_if_less(v_base, 3, 8, is_less);
        swap_if_less(v_base, 4, 7, is_less);
        swap_if_less(v_base, 6, 11, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 0, 1, is_less);
        swap_if_less(v_base, 2, 5, is_less);
        swap_if_less(v_base, 6, 9, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 10, 11, is_less);
        swap_if_less(v_base, 1, 3, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 7, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 6, 7, is_less);
        swap_if_less(v_base, 8, 9, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
    }
}

/// Sorts range [begin, tail] assuming [begin, tail) is already sorted.
///
/// # Safety
/// begin < tail and p must be valid and initialized for all begin <= p <= tail.
unsafe fn insert_tail<T, F: FnMut(&T, &T) -> bool>(begin: *mut T, tail: *mut T, is_less: &mut F) {
    // SAFETY: see individual comments.
    unsafe {
        // SAFETY: in-bounds as tail > begin.
        let mut sift = tail.sub(1);
        if !is_less(&*tail, &*sift) {
            return;
        }

        // SAFETY: after this read tail is never read from again, as we only ever
        // read from sift, sift < tail and we only ever decrease sift. Thus this is
        // effectively a move, not a copy. Should a panic occur, or we have found
        // the correct insertion position, gap_guard ensures the element is moved
        // back into the array.
        let tmp = ManuallyDrop::new(tail.read());
        let mut gap_guard = CopyOnDrop { src: &*tmp, dst: tail, len: 1 };

        loop {
            // SAFETY: we move sift into the gap (which is valid), and point the
            // gap guard destination at sift, ensuring that if a panic occurs the
            // gap is once again filled.
            ptr::copy_nonoverlapping(sift, gap_guard.dst, 1);
            gap_guard.dst = sift;

            if sift == begin {
                break;
            }

            // SAFETY: we checked that sift != begin, thus this is in-bounds.
            sift = sift.sub(1);
            if !is_less(&tmp, &*sift) {
                break;
            }
        }
    }
}

/// Sort `v` assuming `v[..offset]` is already sorted.
pub fn insertion_sort_shift_left<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    offset: usize,
    is_less: &mut F,
) {
    let len = v.len();
    if offset == 0 || offset > len {
        intrinsics::abort();
    }

    // SAFETY: see individual comments.
    unsafe {
        // We write this basic loop directly using pointers, as when we use a
        // for loop LLVM likes to unroll this loop which we do not want.
        // SAFETY: v_end is the one-past-end pointer, and we checked that
        // offset <= len, thus tail is also in-bounds.
        let v_base = v.as_mut_ptr();
        let v_end = v_base.add(len);
        let mut tail = v_base.add(offset);
        while tail != v_end {
            // SAFETY: v_base and tail are both valid pointers to elements, and
            // v_base < tail since we checked offset != 0.
            insert_tail(v_base, tail, is_less);

            // SAFETY: we checked that tail is not yet the one-past-end pointer.
            tail = tail.add(1);
        }
    }
}
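
// Illustrative usage (a sketch, not a doc-test): sorting `[2, 5, 9, 1, 4]` with `offset = 3`
// treats `[2, 5, 9]` as an already sorted prefix and then inserts `1` and `4`:
//
// let mut v = [2, 5, 9, 1, 4];
// insertion_sort_shift_left(&mut v, 3, &mut |a, b| a < b);
// assert_eq!(v, [1, 2, 4, 5, 9]);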

/// SAFETY: The caller MUST guarantee that `v_base` is valid for 4 reads and
/// `dst` is valid for 4 writes. The result will be stored in `dst[0..4]`.
pub unsafe fn sort4_stable<T, F: FnMut(&T, &T) -> bool>(
    v_base: *const T,
    dst: *mut T,
    is_less: &mut F,
) {
    // By limiting select to picking pointers, we are guaranteed good cmov code-gen
    // regardless of type T's size. Further, this only does 5 instead of 6
    // comparisons compared to a stable transposition 4-element sorting-network,
    // and always copies each element exactly once.

    // SAFETY: all pointers have offset at most 3 from v_base and dst, and are
    // thus in-bounds by the precondition.
    unsafe {
        // Stably create two pairs a <= b and c <= d.
        let c1 = is_less(&*v_base.add(1), &*v_base);
        let c2 = is_less(&*v_base.add(3), &*v_base.add(2));
        let a = v_base.add(c1 as usize);
        let b = v_base.add(!c1 as usize);
        let c = v_base.add(2 + c2 as usize);
        let d = v_base.add(2 + (!c2 as usize));

        // Compare (a, c) and (b, d) to identify max/min. We're left with two
        // unknown elements, but because we are a stable sort we must know which
        // one is leftmost and which one is rightmost.
        // c3, c4 | min max unknown_left unknown_right
        //  0,  0 |  a   d    b         c
        //  0,  1 |  a   b    c         d
        //  1,  0 |  c   d    a         b
        //  1,  1 |  c   b    a         d
        let c3 = is_less(&*c, &*a);
        let c4 = is_less(&*d, &*b);
        let min = c3.select_unpredictable(c, a);
        let max = c4.select_unpredictable(b, d);
        let unknown_left = c3.select_unpredictable(a, c4.select_unpredictable(c, b));
        let unknown_right = c4.select_unpredictable(d, c3.select_unpredictable(b, c));

        // Sort the last two unknown elements.
        let c5 = is_less(&*unknown_right, &*unknown_left);
        let lo = c5.select_unpredictable(unknown_right, unknown_left);
        let hi = c5.select_unpredictable(unknown_left, unknown_right);

        ptr::copy_nonoverlapping(min, dst, 1);
        ptr::copy_nonoverlapping(lo, dst.add(1), 1);
        ptr::copy_nonoverlapping(hi, dst.add(2), 1);
        ptr::copy_nonoverlapping(max, dst.add(3), 1);
    }
}

/// SAFETY: The caller MUST guarantee that `v_base` is valid for 8 reads and
/// writes, `scratch_base` and `dst` MUST be valid for 8 writes. The result will
/// be stored in `dst[0..8]`.
unsafe fn sort8_stable<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v_base: *mut T,
    dst: *mut T,
    scratch_base: *mut T,
    is_less: &mut F,
) {
    // SAFETY: these pointers are all in-bounds by the precondition of our function.
    unsafe {
        sort4_stable(v_base, scratch_base, is_less);
        sort4_stable(v_base.add(4), scratch_base.add(4), is_less);
    }

    // SAFETY: scratch_base[0..8] is now initialized, allowing us to merge back
    // into dst.
    unsafe {
        bidirectional_merge(&*ptr::slice_from_raw_parts(scratch_base, 8), dst, is_less);
    }
}

#[inline(always)]
unsafe fn merge_up<T, F: FnMut(&T, &T) -> bool>(
    mut left_src: *const T,
    mut right_src: *const T,
    mut dst: *mut T,
    is_less: &mut F,
) -> (*const T, *const T, *mut T) {
    // This is a branchless merge utility function.
    // The equivalent code with a branch would be:
    //
    // if !is_less(&*right_src, &*left_src) {
    //     ptr::copy_nonoverlapping(left_src, dst, 1);
    //     left_src = left_src.add(1);
    // } else {
    //     ptr::copy_nonoverlapping(right_src, dst, 1);
    //     right_src = right_src.add(1);
    // }
    // dst = dst.add(1);

    // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
    // to read and `dst` is valid to write, while not aliasing.
    unsafe {
        let is_l = !is_less(&*right_src, &*left_src);
        let src = if is_l { left_src } else { right_src };
        ptr::copy_nonoverlapping(src, dst, 1);
        right_src = right_src.add(!is_l as usize);
        left_src = left_src.add(is_l as usize);
        dst = dst.add(1);
    }

    (left_src, right_src, dst)
}

#[inline(always)]
unsafe fn merge_down<T, F: FnMut(&T, &T) -> bool>(
    mut left_src: *const T,
    mut right_src: *const T,
    mut dst: *mut T,
    is_less: &mut F,
) -> (*const T, *const T, *mut T) {
    // This is a branchless merge utility function.
    // The equivalent code with a branch would be:
    //
    // if !is_less(&*right_src, &*left_src) {
    //     ptr::copy_nonoverlapping(right_src, dst, 1);
    //     right_src = right_src.wrapping_sub(1);
    // } else {
    //     ptr::copy_nonoverlapping(left_src, dst, 1);
    //     left_src = left_src.wrapping_sub(1);
    // }
    // dst = dst.sub(1);

    // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
    // to read and `dst` is valid to write, while not aliasing.
    unsafe {
        let is_l = !is_less(&*right_src, &*left_src);
        let src = if is_l { right_src } else { left_src };
        ptr::copy_nonoverlapping(src, dst, 1);
        right_src = right_src.wrapping_sub(is_l as usize);
        left_src = left_src.wrapping_sub(!is_l as usize);
        dst = dst.sub(1);
    }

    (left_src, right_src, dst)
}

/// Merge v assuming v[..len / 2] and v[len / 2..] are sorted.
///
/// Original idea for bi-directional merging by Igor van den Hoven (quadsort),
/// adapted to only use merge up and down. In contrast to the original
/// parity_merge function, it performs 2 writes instead of 4 per iteration.
///
/// # Safety
/// The caller must guarantee that `dst` is valid for v.len() writes.
/// Also `v.as_ptr()` and `dst` must not alias and v.len() must be >= 2.
///
/// Note that T must be Freeze; the comparison function is evaluated on outdated
/// temporary 'copies' that may not end up in the final array.
unsafe fn bidirectional_merge<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v: &[T],
    dst: *mut T,
    is_less: &mut F,
) {
    // It helps to visualize the merge:
    //
    // Initial:
    //
    //  |dst (in dst)
    //  |left               |right
    //  v                   v
    // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
    //                     ^                   ^
    //                     |left_rev           |right_rev
    //                                         |dst_rev (in dst)
    //
    // After:
    //
    //                      |dst (in dst)
    //        |left         |           |right
    //        v             v           v
    // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
    //       ^             ^           ^
    //       |left_rev     |           |right_rev
    //                     |dst_rev (in dst)
    //
    // In each iteration one of left or right moves up one position, and one of
    // left_rev or right_rev moves down one position, whereas dst always moves
    // up one position and dst_rev always moves down one position. Assuming
    // the input was sorted and the comparison function is correctly implemented,
    // at the end we will have left == left_rev + 1 and right == right_rev + 1,
    // fully consuming the input having written it to dst.

    let len = v.len();
    let src = v.as_ptr();

    let len_div_2 = len / 2;

    // SAFETY: The caller has to ensure that len >= 2.
    unsafe {
        intrinsics::assume(len_div_2 != 0); // This can avoid useless code-gen.
    }

    // SAFETY: no matter what the result of the user-provided comparison function
    // is, all 4 read pointers will always be in-bounds. Writing `dst` and `dst_rev`
    // will always be in bounds if the caller guarantees that `dst` is valid for
    // `v.len()` writes.
    unsafe {
        let mut left = src;
        let mut right = src.add(len_div_2);
        let mut dst = dst;

        let mut left_rev = src.add(len_div_2 - 1);
        let mut right_rev = src.add(len - 1);
        let mut dst_rev = dst.add(len - 1);

        for _ in 0..len_div_2 {
            (left, right, dst) = merge_up(left, right, dst, is_less);
            (left_rev, right_rev, dst_rev) = merge_down(left_rev, right_rev, dst_rev, is_less);
        }

        let left_end = left_rev.wrapping_add(1);
        let right_end = right_rev.wrapping_add(1);

        // Odd length, so one element is left unconsumed in the input.
        if len % 2 != 0 {
            let left_nonempty = left < left_end;
            let last_src = if left_nonempty { left } else { right };
            ptr::copy_nonoverlapping(last_src, dst, 1);
            left = left.add(left_nonempty as usize);
            right = right.add((!left_nonempty) as usize);
        }

        // We should now have consumed the full input exactly once. This can only fail if the
        // user-provided comparison function fails to implement a strict weak ordering. In that
        // case we panic and never access the inconsistent state in dst.
        if left != left_end || right != right_end {
            panic_on_ord_violation();
        }
    }
}
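
// Illustrative input/output (a sketch): with `v = [1, 3, 5, 2, 4, 6]`, the two sorted halves
// `[1, 3, 5]` and `[2, 4, 6]` are merged so that `dst[0..6]` ends up as `[1, 2, 3, 4, 5, 6]`.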

#[cfg_attr(not(feature = "panic_immediate_abort"), inline(never), cold)]
#[cfg_attr(feature = "panic_immediate_abort", inline)]
fn panic_on_ord_violation() -> ! {
    // This is indicative of a logic bug in the user-provided comparison function or Ord
    // implementation. They are expected to implement a total order as explained in the Ord
    // documentation.
    //
    // By panicking we inform the user that they have a logic bug in their program. If a strict
    // weak ordering is not given, the concept of comparison-based sorting cannot yield a sorted
    // result. E.g.: a < b < c < a
    //
    // The Ord documentation requires users to implement a total order. Arguably that's
    // unnecessarily strict in the context of sorting. Issues only arise if the weaker requirement
    // of a strict weak ordering is violated.
    //
    // The panic message talks about a total order because that's what the Ord documentation talks
    // about and requires, so as to not confuse users.
    panic!("user-provided comparison function does not correctly implement a total order");
}

#[must_use]
pub(crate) const fn has_efficient_in_place_swap<T>() -> bool {
    // Heuristic that holds true on all tested 64-bit capable architectures.
    size_of::<T>() <= 8 // size_of::<u64>()
}
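
// Illustrative results (a sketch): `has_efficient_in_place_swap::<u64>()` is `true` (8 bytes),
// while `has_efficient_in_place_swap::<[u64; 2]>()` is `false` (16 bytes), which steers 16-byte
// and larger `Copy` types away from the sorting-network path.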