core/slice/sort/shared/smallsort.rs
//! This module contains a variety of sort implementations that are optimized for small lengths.

use crate::mem::{self, ManuallyDrop, MaybeUninit};
use crate::slice::sort::shared::FreezeMarker;
use crate::{intrinsics, ptr, slice};

// It's important to differentiate between SMALL_SORT_THRESHOLD performance for
// small slices and small-sort performance sorting small sub-slices as part of
// the main quicksort loop. For the former, testing showed that the
// representative benchmarks for real-world performance are cold CPU state and
// not single-size hot benchmarks. For the latter the CPU will call them many
// times, so hot benchmarks are fine and more realistic. And it's worth it to
// optimize sorting small sub-slices with more sophisticated solutions than
// insertion sort.

/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait StableSmallSortTypeImpl: Sized {
    /// Returns the maximum input length for which it is valid to call `small_sort`.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(
        v: &mut [Self],
        scratch: &mut [MaybeUninit<Self>],
        is_less: &mut F,
    );
}

impl<T> StableSmallSortTypeImpl for T {
    #[inline(always)]
    default fn small_sort_threshold() -> usize {
        // Optimal number of comparisons, and good perf.
        SMALL_SORT_FALLBACK_THRESHOLD
    }

    #[inline(always)]
    default fn small_sort<F: FnMut(&T, &T) -> bool>(
        v: &mut [T],
        _scratch: &mut [MaybeUninit<T>],
        is_less: &mut F,
    ) {
        if v.len() >= 2 {
            insertion_sort_shift_left(v, 1, is_less);
        }
    }
}

impl<T: FreezeMarker> StableSmallSortTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        SMALL_SORT_GENERAL_THRESHOLD
    }

    #[inline(always)]
    fn small_sort<F: FnMut(&T, &T) -> bool>(
        v: &mut [T],
        scratch: &mut [MaybeUninit<T>],
        is_less: &mut F,
    ) {
        small_sort_general_with_scratch(v, scratch, is_less);
    }
}
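
// Illustrative call-site sketch (hypothetical, not part of this module's API): the driver
// sort loops are expected to gate on the threshold before calling `small_sort`, roughly:
//
// if v.len() <= <T as StableSmallSortTypeImpl>::small_sort_threshold() {
//     <T as StableSmallSortTypeImpl>::small_sort(v, scratch, is_less);
//     return;
// }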

/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait UnstableSmallSortTypeImpl: Sized {
    /// Returns the maximum input length for which it is valid to call `small_sort`.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
}

impl<T> UnstableSmallSortTypeImpl for T {
    #[inline(always)]
    default fn small_sort_threshold() -> usize {
        SMALL_SORT_FALLBACK_THRESHOLD
    }

    #[inline(always)]
    default fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        small_sort_fallback(v, is_less);
    }
}

impl<T: FreezeMarker> UnstableSmallSortTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        <T as UnstableSmallSortFreezeTypeImpl>::small_sort_threshold()
    }

    #[inline(always)]
    fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        <T as UnstableSmallSortFreezeTypeImpl>::small_sort(v, is_less);
    }
}

/// FIXME(const_trait_impl) use original ipnsort approach with choose_unstable_small_sort,
/// as found here <https://github.com/Voultapher/sort-research-rs/blob/438fad5d0495f65d4b72aa87f0b62fc96611dff3/ipnsort/src/smallsort.rs#L83C10-L83C36>.
pub(crate) trait UnstableSmallSortFreezeTypeImpl: Sized + FreezeMarker {
    fn small_sort_threshold() -> usize;

    fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
}

impl<T: FreezeMarker> UnstableSmallSortFreezeTypeImpl for T {
    #[inline(always)]
    default fn small_sort_threshold() -> usize {
        if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            SMALL_SORT_GENERAL_THRESHOLD
        } else {
            SMALL_SORT_FALLBACK_THRESHOLD
        }
    }

    #[inline(always)]
    default fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            small_sort_general(v, is_less);
        } else {
            small_sort_fallback(v, is_less);
        }
    }
}

/// SAFETY: Only used for run-time optimization heuristic.
#[rustc_unsafe_specialization_marker]
trait CopyMarker {}

impl<T: Copy> CopyMarker for T {}

impl<T: FreezeMarker + CopyMarker> UnstableSmallSortFreezeTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        if has_efficient_in_place_swap::<T>()
            && (size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
        {
            SMALL_SORT_NETWORK_THRESHOLD
        } else if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            SMALL_SORT_GENERAL_THRESHOLD
        } else {
            SMALL_SORT_FALLBACK_THRESHOLD
        }
    }

    #[inline(always)]
    fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        if has_efficient_in_place_swap::<T>()
            && (size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
        {
            small_sort_network(v, is_less);
        } else if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            small_sort_general(v, is_less);
        } else {
            small_sort_fallback(v, is_less);
        }
    }
}

/// Optimal number of comparisons, and good perf.
const SMALL_SORT_FALLBACK_THRESHOLD: usize = 16;

/// From a comparison perspective 20 was ~2% more efficient for fully random input, but for
/// wall-clock performance choosing 32 yielded better performance overall.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
const SMALL_SORT_GENERAL_THRESHOLD: usize = 32;

/// [`small_sort_general`] uses [`sort8_stable`] as primitive and does a kind of ping-pong merge,
/// where the output of the first two [`sort8_stable`] calls is stored at the end of the scratch
/// buffer. This simplifies panic handling and avoids additional copies. This affects the required
/// scratch buffer size.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
pub(crate) const SMALL_SORT_GENERAL_SCRATCH_LEN: usize = SMALL_SORT_GENERAL_THRESHOLD + 16;

/// SAFETY: If you change this value, you have to adjust [`small_sort_network`] !
const SMALL_SORT_NETWORK_THRESHOLD: usize = 32;
const SMALL_SORT_NETWORK_SCRATCH_LEN: usize = SMALL_SORT_NETWORK_THRESHOLD;

/// Using a stack array could cause a stack overflow if the type `T` is very large. To be
/// conservative we limit the usage of small-sorts that require a stack array to types that fit
/// within this limit.
const MAX_STACK_ARRAY_SIZE: usize = 4096;
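
// Worked example (illustrative, not normative): for an 8-byte `T` the general small-sort
// needs `SMALL_SORT_GENERAL_SCRATCH_LEN * size_of::<T>() = 48 * 8 = 384` bytes of stack
// scratch, comfortably below `MAX_STACK_ARRAY_SIZE`. A 256-byte `T` would need
// `48 * 256 = 12288` bytes and is therefore routed to `small_sort_fallback` by the
// unstable threshold functions above.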

fn small_sort_fallback<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    if v.len() >= 2 {
        insertion_sort_shift_left(v, 1, is_less);
    }
}

fn small_sort_general<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    let mut stack_array = MaybeUninit::<[T; SMALL_SORT_GENERAL_SCRATCH_LEN]>::uninit();

    // SAFETY: The memory is backed by `stack_array`, and the operation is safe as long as the len
    // is the same.
    let scratch = unsafe {
        slice::from_raw_parts_mut(
            stack_array.as_mut_ptr() as *mut MaybeUninit<T>,
            SMALL_SORT_GENERAL_SCRATCH_LEN,
        )
    };

    small_sort_general_with_scratch(v, scratch, is_less);
}

fn small_sort_general_with_scratch<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return;
    }

    if scratch.len() < len + 16 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();
    let len_div_2 = len / 2;

    // SAFETY: See individual comments.
    unsafe {
        let scratch_base = scratch.as_mut_ptr() as *mut T;

        let presorted_len = if const { size_of::<T>() <= 16 } && len >= 16 {
            // SAFETY: scratch_base is valid and has enough space.
            sort8_stable(v_base, scratch_base, scratch_base.add(len), is_less);
            sort8_stable(
                v_base.add(len_div_2),
                scratch_base.add(len_div_2),
                scratch_base.add(len + 8),
                is_less,
            );

            8
        } else if len >= 8 {
            // SAFETY: scratch_base is valid and has enough space.
            sort4_stable(v_base, scratch_base, is_less);
            sort4_stable(v_base.add(len_div_2), scratch_base.add(len_div_2), is_less);

            4
        } else {
            ptr::copy_nonoverlapping(v_base, scratch_base, 1);
            ptr::copy_nonoverlapping(v_base.add(len_div_2), scratch_base.add(len_div_2), 1);

            1
        };

        for offset in [0, len_div_2] {
            // SAFETY: at this point dst is initialized with presorted_len elements.
            // We extend this to desired_len, src is valid for desired_len elements.
            let src = v_base.add(offset);
            let dst = scratch_base.add(offset);
            let desired_len = if offset == 0 { len_div_2 } else { len - len_div_2 };

            for i in presorted_len..desired_len {
                ptr::copy_nonoverlapping(src.add(i), dst.add(i), 1);
                insert_tail(dst, dst.add(i), is_less);
            }
        }

        // SAFETY: see comment in `CopyOnDrop::drop`.
        let drop_guard = CopyOnDrop { src: scratch_base, dst: v_base, len };

        // SAFETY: at this point scratch_base is fully initialized, allowing us
        // to use it as the source of our merge back into the original array.
        // If a panic occurs we ensure the original array is restored to a valid
        // permutation of the input through drop_guard. This technique is similar
        // to ping-pong merging.
        bidirectional_merge(
            &*ptr::slice_from_raw_parts(drop_guard.src, drop_guard.len),
            drop_guard.dst,
            is_less,
        );
        mem::forget(drop_guard);
    }
}

struct CopyOnDrop<T> {
    src: *const T,
    dst: *mut T,
    len: usize,
}

impl<T> Drop for CopyOnDrop<T> {
    fn drop(&mut self) {
        // SAFETY: `src` must contain `len` initialized elements, and dst must
        // be valid to write `len` elements.
        unsafe {
            ptr::copy_nonoverlapping(self.src, self.dst, self.len);
        }
    }
}
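
// Note (illustrative summary of the two uses in this file): `CopyOnDrop` doubles as a panic
// guard. In `small_sort_general_with_scratch` it copies the fully initialized scratch buffer
// back into `v` if the user comparison panics mid-merge, and in `insert_tail` it is used with
// `len: 1` as a gap guard that writes the temporarily removed element back into the hole.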

fn small_sort_network<T, F>(v: &mut [T], is_less: &mut F)
where
    T: FreezeMarker,
    F: FnMut(&T, &T) -> bool,
{
    // This implementation is tuned to be efficient for integer types.

    let len = v.len();
    if len < 2 {
        return;
    }

    if len > SMALL_SORT_NETWORK_SCRATCH_LEN {
        intrinsics::abort();
    }

    let mut stack_array = MaybeUninit::<[T; SMALL_SORT_NETWORK_SCRATCH_LEN]>::uninit();

    let len_div_2 = len / 2;
    let no_merge = len < 18;

    let v_base = v.as_mut_ptr();
    let initial_region_len = if no_merge { len } else { len_div_2 };
    // SAFETY: Both possible values of `initial_region_len` are in-bounds.
    let mut region = unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base, initial_region_len) };

    // Avoid compiler unrolling; we *really* don't want that to happen here for binary-size reasons.
    loop {
        let presorted_len = if region.len() >= 13 {
            sort13_optimal(region, is_less);
            13
        } else if region.len() >= 9 {
            sort9_optimal(region, is_less);
            9
        } else {
            1
        };

        insertion_sort_shift_left(region, presorted_len, is_less);

        if no_merge {
            return;
        }

        if region.as_ptr() != v_base {
            break;
        }

        // SAFETY: The right side of `v` based on `len_div_2` is guaranteed in-bounds.
        unsafe {
            region = &mut *ptr::slice_from_raw_parts_mut(v_base.add(len_div_2), len - len_div_2)
        };
    }

    // SAFETY: We checked that T is Freeze and thus observation safe.
    // Should `is_less` panic, `v` was not modified by the merge and retains its original input.
    // `scratch` and `v` must not alias and `scratch` has `v.len()` space.
    unsafe {
        let scratch_base = stack_array.as_mut_ptr() as *mut T;
        bidirectional_merge(
            &mut *ptr::slice_from_raw_parts_mut(v_base, len),
            scratch_base,
            is_less,
        );
        ptr::copy_nonoverlapping(scratch_base, v_base, len);
    }
}

/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
/// value at position `b_pos` is less than the one at position `a_pos`.
///
/// Purposefully not marked `#[inline]`, despite us wanting it to be inlined for integer-like
/// types. `is_less` could be a huge function and we want to give the compiler an option to
/// not inline this function. Because this function is very perf-critical, it should be in the
/// same module as the functions that use it.
unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: the caller must guarantee that `a_pos` and `b_pos` each added to `v_base` yield valid
    // pointers into `v_base`, and are properly aligned, and part of the same allocation.
    unsafe {
        let v_a = v_base.add(a_pos);
        let v_b = v_base.add(b_pos);

        // PANIC SAFETY: if `is_less` panics, no scratch memory was created and the slice should
        // still be in a well-defined state, without duplicates.

        // It is important to only swap if `*v_b` is strictly less than `*v_a`, and not when they
        // compare equal. `is_less` should return false for equal elements, so we don't swap.
        let should_swap = is_less(&*v_b, &*v_a);

        // This is a branchless version of swap if.
        // The equivalent code with a branch would be:
        //
        // if should_swap {
        //     ptr::swap(v_a, v_b);
        // }

        // The goal is to generate cmov instructions here.
        let v_a_swap = should_swap.select_unpredictable(v_b, v_a);
        let v_b_swap = should_swap.select_unpredictable(v_a, v_b);

        let v_b_swap_tmp = ManuallyDrop::new(ptr::read(v_b_swap));
        ptr::copy(v_a_swap, v_a, 1);
        ptr::copy_nonoverlapping(&*v_b_swap_tmp, v_b, 1);
    }
}

/// Sorts the first 9 elements of `v` with a fast fixed function.
///
/// Should `is_less` generate substantial amounts of code the compiler can choose to not inline
/// `swap_if_less`. If the code of a sort impl changes so as to call this function in multiple
/// places, `#[inline(never)]` is recommended to keep binary-size in check. The current design of
/// `small_sort_network` makes sure to only call this once.
fn sort9_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    if v.len() < 9 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();

    // Optimal sorting network see:
    // https://bertdobbelaere.github.io/sorting_networks.html.

    // SAFETY: We checked the len.
    unsafe {
        swap_if_less(v_base, 0, 3, is_less);
        swap_if_less(v_base, 1, 7, is_less);
        swap_if_less(v_base, 2, 5, is_less);
        swap_if_less(v_base, 4, 8, is_less);
        swap_if_less(v_base, 0, 7, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 3, 8, is_less);
        swap_if_less(v_base, 5, 6, is_less);
        swap_if_less(v_base, 0, 2, is_less);
        swap_if_less(v_base, 1, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 1, 4, is_less);
        swap_if_less(v_base, 3, 6, is_less);
        swap_if_less(v_base, 5, 7, is_less);
        swap_if_less(v_base, 0, 1, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 3, 5, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 6, 7, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
    }
}

/// Sorts the first 13 elements of `v` with a fast fixed function.
///
/// Should `is_less` generate substantial amounts of code the compiler can choose to not inline
/// `swap_if_less`. If the code of a sort impl changes so as to call this function in multiple
/// places, `#[inline(never)]` is recommended to keep binary-size in check. The current design of
/// `small_sort_network` makes sure to only call this once.
fn sort13_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    if v.len() < 13 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();

    // Optimal sorting network see:
    // https://bertdobbelaere.github.io/sorting_networks.html.

    // SAFETY: We checked the len.
    unsafe {
        swap_if_less(v_base, 0, 12, is_less);
        swap_if_less(v_base, 1, 10, is_less);
        swap_if_less(v_base, 2, 9, is_less);
        swap_if_less(v_base, 3, 7, is_less);
        swap_if_less(v_base, 5, 11, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 1, 6, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 11, is_less);
        swap_if_less(v_base, 7, 9, is_less);
        swap_if_less(v_base, 8, 10, is_less);
        swap_if_less(v_base, 0, 4, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 6, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 11, 12, is_less);
        swap_if_less(v_base, 4, 6, is_less);
        swap_if_less(v_base, 5, 9, is_less);
        swap_if_less(v_base, 8, 11, is_less);
        swap_if_less(v_base, 10, 12, is_less);
        swap_if_less(v_base, 0, 5, is_less);
        swap_if_less(v_base, 3, 8, is_less);
        swap_if_less(v_base, 4, 7, is_less);
        swap_if_less(v_base, 6, 11, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 0, 1, is_less);
        swap_if_less(v_base, 2, 5, is_less);
        swap_if_less(v_base, 6, 9, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 10, 11, is_less);
        swap_if_less(v_base, 1, 3, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 7, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 6, 7, is_less);
        swap_if_less(v_base, 8, 9, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
    }
}

/// Sorts range [begin, tail] assuming [begin, tail) is already sorted.
///
/// # Safety
/// begin < tail and p must be valid and initialized for all begin <= p <= tail.
unsafe fn insert_tail<T, F: FnMut(&T, &T) -> bool>(begin: *mut T, tail: *mut T, is_less: &mut F) {
    // SAFETY: see individual comments.
    unsafe {
        // SAFETY: in-bounds as tail > begin.
        let mut sift = tail.sub(1);
        if !is_less(&*tail, &*sift) {
            return;
        }

        // SAFETY: after this read tail is never read from again, as we only ever
        // read from sift, sift < tail and we only ever decrease sift. Thus this is
        // effectively a move, not a copy. Should a panic occur, or we have found
        // the correct insertion position, gap_guard ensures the element is moved
        // back into the array.
        let tmp = ManuallyDrop::new(tail.read());
        let mut gap_guard = CopyOnDrop { src: &*tmp, dst: tail, len: 1 };

        loop {
            // SAFETY: we move sift into the gap (which is valid), and point the
            // gap guard destination at sift, ensuring that if a panic occurs the
            // gap is once again filled.
            ptr::copy_nonoverlapping(sift, gap_guard.dst, 1);
            gap_guard.dst = sift;

            if sift == begin {
                break;
            }

            // SAFETY: we checked that sift != begin, thus this is in-bounds.
            sift = sift.sub(1);
            if !is_less(&tmp, &*sift) {
                break;
            }
        }
    }
}

/// Sort `v` assuming `v[..offset]` is already sorted.
pub fn insertion_sort_shift_left<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    offset: usize,
    is_less: &mut F,
) {
    let len = v.len();
    if offset == 0 || offset > len {
        intrinsics::abort();
    }

    // SAFETY: see individual comments.
    unsafe {
        // We write this basic loop directly using pointers; when we use a for
        // loop, LLVM likes to unroll it, which we do not want here.
        // SAFETY: v_end is the one-past-end pointer, and we checked that
        // offset <= len, thus tail is also in-bounds.
        let v_base = v.as_mut_ptr();
        let v_end = v_base.add(len);
        let mut tail = v_base.add(offset);
        while tail != v_end {
            // SAFETY: v_base and tail are both valid pointers to elements, and
            // v_base < tail since we checked offset != 0.
            insert_tail(v_base, tail, is_less);

            // SAFETY: we checked that tail is not yet the one-past-end pointer.
            tail = tail.add(1);
        }
    }
}
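
// Illustrative usage sketch (hypothetical, not part of this module): with a trivially
// sorted prefix of length 1 this degenerates to plain insertion sort.
//
// let mut v = [3, 1, 2];
// insertion_sort_shift_left(&mut v, 1, &mut |a, b| a < b);
// assert_eq!(v, [1, 2, 3]);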

/// SAFETY: The caller MUST guarantee that `v_base` is valid for 4 reads and
/// `dst` is valid for 4 writes. The result will be stored in `dst[0..4]`.
pub unsafe fn sort4_stable<T, F: FnMut(&T, &T) -> bool>(
    v_base: *const T,
    dst: *mut T,
    is_less: &mut F,
) {
    // By limiting select to picking pointers, we are guaranteed good cmov code-gen
    // regardless of type T's size. Further this only does 5 instead of 6
    // comparisons compared to a stable transposition 4 element sorting-network,
    // and always copies each element exactly once.

    // SAFETY: all pointers have offset at most 3 from v_base and dst, and are
    // thus in-bounds by the precondition.
    unsafe {
        // Stably create two pairs a <= b and c <= d.
        let c1 = is_less(&*v_base.add(1), &*v_base);
        let c2 = is_less(&*v_base.add(3), &*v_base.add(2));
        let a = v_base.add(c1 as usize);
        let b = v_base.add(!c1 as usize);
        let c = v_base.add(2 + c2 as usize);
        let d = v_base.add(2 + (!c2 as usize));

        // Compare (a, c) and (b, d) to identify max/min. We're left with two
        // unknown elements, but because we are a stable sort we must know which
        // one is leftmost and which one is rightmost.
        // c3, c4 | min max unknown_left unknown_right
        //  0,  0 |  a   d       b             c
        //  0,  1 |  a   b       c             d
        //  1,  0 |  c   d       a             b
        //  1,  1 |  c   b       a             d
        let c3 = is_less(&*c, &*a);
        let c4 = is_less(&*d, &*b);
        let min = c3.select_unpredictable(c, a);
        let max = c4.select_unpredictable(b, d);
        let unknown_left = c3.select_unpredictable(a, c4.select_unpredictable(c, b));
        let unknown_right = c4.select_unpredictable(d, c3.select_unpredictable(b, c));

        // Sort the last two unknown elements.
        let c5 = is_less(&*unknown_right, &*unknown_left);
        let lo = c5.select_unpredictable(unknown_right, unknown_left);
        let hi = c5.select_unpredictable(unknown_left, unknown_right);

        ptr::copy_nonoverlapping(min, dst, 1);
        ptr::copy_nonoverlapping(lo, dst.add(1), 1);
        ptr::copy_nonoverlapping(hi, dst.add(2), 1);
        ptr::copy_nonoverlapping(max, dst.add(3), 1);
    }
}
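
// Worked example (illustrative): for input [2, 1, 4, 3] the pairing step yields a = 1, b = 2
// (from c1 = true) and c = 3, d = 4 (from c2 = true). Then c3 and c4 are both false, so
// min = a = 1 and max = d = 4, the unknown elements are b = 2 and c = 3, and the final
// comparison writes [1, 2, 3, 4] into `dst`.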

/// SAFETY: The caller MUST guarantee that `v_base` is valid for 8 reads and
/// writes, `scratch_base` and `dst` MUST be valid for 8 writes. The result will
/// be stored in `dst[0..8]`.
unsafe fn sort8_stable<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v_base: *mut T,
    dst: *mut T,
    scratch_base: *mut T,
    is_less: &mut F,
) {
    // SAFETY: these pointers are all in-bounds by the precondition of our function.
    unsafe {
        sort4_stable(v_base, scratch_base, is_less);
        sort4_stable(v_base.add(4), scratch_base.add(4), is_less);
    }

    // SAFETY: scratch_base[0..8] is now initialized, allowing us to merge back
    // into dst.
    unsafe {
        bidirectional_merge(&*ptr::slice_from_raw_parts(scratch_base, 8), dst, is_less);
    }
}

#[inline(always)]
unsafe fn merge_up<T, F: FnMut(&T, &T) -> bool>(
    mut left_src: *const T,
    mut right_src: *const T,
    mut dst: *mut T,
    is_less: &mut F,
) -> (*const T, *const T, *mut T) {
    // This is a branchless merge utility function.
    // The equivalent code with a branch would be:
    //
    // if !is_less(&*right_src, &*left_src) {
    //     ptr::copy_nonoverlapping(left_src, dst, 1);
    //     left_src = left_src.add(1);
    // } else {
    //     ptr::copy_nonoverlapping(right_src, dst, 1);
    //     right_src = right_src.add(1);
    // }
    // dst = dst.add(1);

    // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
    // to read and `dst` is valid to write, while not aliasing.
    unsafe {
        let is_l = !is_less(&*right_src, &*left_src);
        let src = if is_l { left_src } else { right_src };
        ptr::copy_nonoverlapping(src, dst, 1);
        right_src = right_src.add(!is_l as usize);
        left_src = left_src.add(is_l as usize);
        dst = dst.add(1);
    }

    (left_src, right_src, dst)
}

#[inline(always)]
unsafe fn merge_down<T, F: FnMut(&T, &T) -> bool>(
    mut left_src: *const T,
    mut right_src: *const T,
    mut dst: *mut T,
    is_less: &mut F,
) -> (*const T, *const T, *mut T) {
    // This is a branchless merge utility function.
    // The equivalent code with a branch would be:
    //
    // if !is_less(&*right_src, &*left_src) {
    //     ptr::copy_nonoverlapping(right_src, dst, 1);
    //     right_src = right_src.wrapping_sub(1);
    // } else {
    //     ptr::copy_nonoverlapping(left_src, dst, 1);
    //     left_src = left_src.wrapping_sub(1);
    // }
    // dst = dst.sub(1);

    // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
    // to read and `dst` is valid to write, while not aliasing.
    unsafe {
        let is_l = !is_less(&*right_src, &*left_src);
        let src = if is_l { right_src } else { left_src };
        ptr::copy_nonoverlapping(src, dst, 1);
        right_src = right_src.wrapping_sub(is_l as usize);
        left_src = left_src.wrapping_sub(!is_l as usize);
        dst = dst.sub(1);
    }

    (left_src, right_src, dst)
}

/// Merge v assuming v[..len / 2] and v[len / 2..] are sorted.
///
/// Original idea for bi-directional merging by Igor van den Hoven (quadsort),
/// adapted to only use merge up and down. In contrast to the original
/// parity_merge function, it performs 2 writes instead of 4 per iteration.
///
/// # Safety
/// The caller must guarantee that `dst` is valid for v.len() writes.
/// Also `v.as_ptr()` and `dst` must not alias and v.len() must be >= 2.
/// Note that `T` must be `Freeze`; the comparison function is evaluated on outdated
/// temporary 'copies' that may not end up in the final array.
unsafe fn bidirectional_merge<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v: &[T],
    dst: *mut T,
    is_less: &mut F,
) {
    // It helps to visualize the merge:
    //
    // Initial:
    //
    //  |dst (in dst)
    //  |left               |right
    //  v                   v
    // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
    //                     ^                   ^
    //                     |left_rev           |right_rev
    //                                         |dst_rev (in dst)
    //
    // After:
    //
    //                      |dst (in dst)
    //        |left         |           |right
    //        v             v           v
    // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
    //       ^             ^           ^
    //       |left_rev     |           |right_rev
    //                     |dst_rev (in dst)
    //
    // In each iteration one of left or right moves up one position, and one of
    // left_rev or right_rev moves down one position, whereas dst always moves
    // up one position and dst_rev always moves down one position. Assuming
    // the input was sorted and the comparison function is correctly implemented
    // at the end we will have left == left_rev + 1, and right == right_rev + 1,
    // fully consuming the input having written it to dst.

    let len = v.len();
    let src = v.as_ptr();

    let len_div_2 = len / 2;

    // SAFETY: The caller has to ensure that len >= 2.
    unsafe {
        intrinsics::assume(len_div_2 != 0); // This can avoid useless code-gen.
    }

    // SAFETY: no matter what the result of the user-provided comparison function
    // is, all 4 read pointers will always be in-bounds. Writing `dst` and `dst_rev`
    // will always be in bounds if the caller guarantees that `dst` is valid for
    // `v.len()` writes.
    unsafe {
        let mut left = src;
        let mut right = src.add(len_div_2);
        let mut dst = dst;

        let mut left_rev = src.add(len_div_2 - 1);
        let mut right_rev = src.add(len - 1);
        let mut dst_rev = dst.add(len - 1);

        for _ in 0..len_div_2 {
            (left, right, dst) = merge_up(left, right, dst, is_less);
            (left_rev, right_rev, dst_rev) = merge_down(left_rev, right_rev, dst_rev, is_less);
        }

        let left_end = left_rev.wrapping_add(1);
        let right_end = right_rev.wrapping_add(1);

        // Odd length, so one element is left unconsumed in the input.
        if len % 2 != 0 {
            let left_nonempty = left < left_end;
            let last_src = if left_nonempty { left } else { right };
            ptr::copy_nonoverlapping(last_src, dst, 1);
            left = left.add(left_nonempty as usize);
            right = right.add((!left_nonempty) as usize);
        }

        // We now should have consumed the full input exactly once. This can only fail if the
        // user-provided comparison function fails to implement a strict weak ordering. In that case
        // we panic and never access the inconsistent state in dst.
        if left != left_end || right != right_end {
            panic_on_ord_violation();
        }
    }
}

#[cfg_attr(not(feature = "panic_immediate_abort"), inline(never), cold)]
#[cfg_attr(feature = "panic_immediate_abort", inline)]
fn panic_on_ord_violation() -> ! {
    // This is indicative of a logic bug in the user-provided comparison function or Ord
    // implementation. They are expected to implement a total order as explained in the Ord
    // documentation.
    //
    // By panicking we inform the user that they have a logic bug in their program. If a strict
    // weak ordering is not given, the concept of comparison-based sorting cannot yield a sorted
    // result. E.g.: a < b < c < a
    //
    // The Ord documentation requires users to implement a total order. Arguably that's
    // unnecessarily strict in the context of sorting. Issues only arise if the weaker requirement
    // of a strict weak ordering is violated.
    //
    // The panic message talks about a total order because that's what the Ord documentation talks
    // about and requires, so as to not confuse users.
    panic!("user-provided comparison function does not correctly implement a total order");
}

#[must_use]
pub(crate) const fn has_efficient_in_place_swap<T>() -> bool {
    // Heuristic that holds true on all tested 64-bit capable architectures.
    size_of::<T>() <= 8 // size_of::<u64>()
}
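
// Illustrative examples (not exhaustive): `has_efficient_in_place_swap::<u64>()` is true
// (8 bytes), while a 16-byte type such as `u128` or `(u64, u64)` returns false, so such types
// are routed to `small_sort_general` or `small_sort_fallback` instead of the sorting-network
// path in the `Copy + Freeze` specialization above.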