Skip to content

Commit

Permalink
Add binary-size optimized variants for stable and unstable sort as we…
Browse files Browse the repository at this point in the history
…ll as select_nth_unstable

- Stable sort uses a simple merge-sort that re-uses the existing - rather gnarly - merge function.
- Unstable sort jumps directly to the branchless heapsort fallback.
- select_nth_unstable jumps directly to the median_of_medians fallback, which is augmented with a
  custom tiny smallsort and partition impl.

Some code is duplicated but de-duplication would bring it's own problems. For example `swap_if_less`
is critical for performance, if the sorting networks don't inline it perf drops drastically,
however `#[inline(always)]` is also a poor fit, if the provided comparison function is huge,
it gives the compiler an out to only instantiate `swap_if_less` once and call it. Another aspect
that would suffer when making `swap_if_less` pub, is having to cfg out dozens of functions in
in smallsort module.
  • Loading branch information
Voultapher committed Aug 25, 2024
1 parent e2614f2 commit 1805c29
Show file tree
Hide file tree
Showing 7 changed files with 284 additions and 33 deletions.
1 change: 1 addition & 0 deletions core/src/slice/sort/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ pub mod stable;
pub mod unstable;

pub(crate) mod select;
#[cfg(not(feature = "optimize_for_size"))]
pub(crate) mod shared;
125 changes: 122 additions & 3 deletions core/src/slice/sort/select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,13 @@
//! for pivot selection. Using this as a fallback ensures O(n) worst case running time with
//! better performance than one would get using heapsort as fallback.
use crate::intrinsics;
use crate::mem::{self, SizedTypeProperties};
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::pivot::choose_pivot;
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::unstable::quicksort::partition;

/// Reorders the slice such that the element at `index` is at its final sorted position.
Expand Down Expand Up @@ -40,7 +44,15 @@ where
let min_idx = min_index(v, &mut is_less).unwrap();
v.swap(min_idx, index);
} else {
partition_at_index_loop(v, index, None, &mut is_less);
#[cfg(not(feature = "optimize_for_size"))]
{
partition_at_index_loop(v, index, None, &mut is_less);
}

#[cfg(feature = "optimize_for_size")]
{
median_of_medians(v, &mut is_less, index);
}
}

let (left, right) = v.split_at_mut(index);
Expand All @@ -53,6 +65,7 @@ where
// most once, it doesn't make sense to use something more sophisticated than insertion-sort.
const INSERTION_SORT_THRESHOLD: usize = 16;

#[cfg(not(feature = "optimize_for_size"))]
fn partition_at_index_loop<'a, T, F>(
mut v: &'a mut [T],
mut index: usize,
Expand Down Expand Up @@ -167,8 +180,17 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut
loop {
if v.len() <= INSERTION_SORT_THRESHOLD {
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
#[cfg(not(feature = "optimize_for_size"))]
{
insertion_sort_shift_left(v, 1, is_less);
}

#[cfg(feature = "optimize_for_size")]
{
bubble_sort(v, is_less);
}
}

return;
}

Expand Down Expand Up @@ -230,7 +252,15 @@ fn median_of_ninthers<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F)

median_of_medians(&mut v[lo..lo + frac], is_less, pivot);

partition(v, lo + pivot, is_less)
#[cfg(not(feature = "optimize_for_size"))]
{
partition(v, lo + pivot, is_less)
}

#[cfg(feature = "optimize_for_size")]
{
partition_size_opt(v, lo + pivot, is_less)
}
}

/// Moves around the 9 elements at the indices a..i, such that
Expand Down Expand Up @@ -298,3 +328,92 @@ fn median_idx<T, F: FnMut(&T, &T) -> bool>(
}
b
}

// It's possible to re-use the insertion sort in the smallsort module, but with optimize_for_size it
// would clutter that module with cfg statements and make it generally harder to read and develop.
// So to decouple things and simplify it, we use a an even smaller bubble sort.
#[cfg(feature = "optimize_for_size")]
fn bubble_sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
let mut n = v.len();
let mut did_swap = true;

while did_swap && n > 1 {
did_swap = false;
for i in 1..n {
// SAFETY: The loop construction implies that `i` and `i - 1` will always be in-bounds.
unsafe {
if is_less(v.get_unchecked(i), v.get_unchecked(i - 1)) {
v.swap_unchecked(i - 1, i);
did_swap = true;
}
}
}
n -= 1;
}
}

#[cfg(feature = "optimize_for_size")]
fn partition_size_opt<T, F>(v: &mut [T], pivot: usize, is_less: &mut F) -> usize
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();

// Allows for panic-free code-gen by proving this property to the compiler.
if len == 0 {
return 0;
}

if pivot >= len {
intrinsics::abort();
}

// SAFETY: We checked that `pivot` is in-bounds.
unsafe {
// Place the pivot at the beginning of slice.
v.swap_unchecked(0, pivot);
}
let (pivot, v_without_pivot) = v.split_at_mut(1);

// Assuming that Rust generates noalias LLVM IR we can be sure that a partition function
// signature of the form `(v: &mut [T], pivot: &T)` guarantees that pivot and v can't alias.
// Having this guarantee is crucial for optimizations. It's possible to copy the pivot value
// into a stack value, but this creates issues for types with interior mutability mandating
// a drop guard.
let pivot = &mut pivot[0];

let num_lt = partition_lomuto_branchless_simple(v_without_pivot, pivot, is_less);

if num_lt >= len {
intrinsics::abort();
}

// SAFETY: We checked that `num_lt` is in-bounds.
unsafe {
// Place the pivot between the two partitions.
v.swap_unchecked(0, num_lt);
}

num_lt
}

#[cfg(feature = "optimize_for_size")]
fn partition_lomuto_branchless_simple<T, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
pivot: &T,
is_less: &mut F,
) -> usize {
let mut left = 0;

for right in 0..v.len() {
// SAFETY: `left` can at max be incremented by 1 each loop iteration, which implies that
// left <= right and that both are in-bounds.
unsafe {
let right_is_lt = is_less(v.get_unchecked(right), pivot);
v.swap_unchecked(left, right);
left += right_is_lt as usize;
}
}

left
}
2 changes: 1 addition & 1 deletion core/src/slice/sort/shared/smallsort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ where

/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
/// value at position `b_pos` is less than the one at position `a_pos`.
pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
Expand Down
58 changes: 45 additions & 13 deletions core/src/slice/sort/stable/mod.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
//! This module contains the entry points for `slice::sort`.
#[cfg(not(feature = "optimize_for_size"))]
use crate::cmp;
use crate::intrinsics;
use crate::mem::{self, MaybeUninit, SizedTypeProperties};
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::smallsort::{
insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN,
};
use crate::{cmp, intrinsics};

pub(crate) mod drift;
pub(crate) mod merge;

#[cfg(not(feature = "optimize_for_size"))]
pub(crate) mod drift;
#[cfg(not(feature = "optimize_for_size"))]
pub(crate) mod quicksort;

#[cfg(feature = "optimize_for_size")]
pub(crate) mod tiny;

/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
/// Design document:
/// </~https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
Expand All @@ -30,25 +39,48 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less
return;
}

// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
#[cfg(not(feature = "optimize_for_size"))]
{
// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}

driftsort_main::<T, F, BufT>(v, is_less);
}

driftsort_main::<T, F, BufT>(v, is_less);
#[cfg(feature = "optimize_for_size")]
{
let alloc_len = len / 2;

// For small inputs 4KiB of stack storage suffices, which allows us to avoid
// calling the (de-)allocator. Benchmarks showed this was quite beneficial.
let mut stack_buf = AlignedStorage::<T, 4096>::new();
let stack_scratch = stack_buf.as_uninit_slice_mut();
let mut heap_buf;
let scratch = if stack_scratch.len() >= alloc_len {
stack_scratch
} else {
heap_buf = BufT::with_capacity(alloc_len);
heap_buf.as_uninit_slice_mut()
};

tiny::mergesort(v, scratch, is_less);
}
}

/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[cfg(not(feature = "optimize_for_size"))]
#[inline(never)]
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
// By allocating n elements of memory we can ensure the entire input can
Expand Down
75 changes: 75 additions & 0 deletions core/src/slice/sort/stable/tiny.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
//! Binary-size optimized mergesort inspired by /~https://github.com/voultapher/tiny-sort-rs.
use crate::mem::{ManuallyDrop, MaybeUninit};
use crate::ptr;
use crate::slice::sort::stable::merge;

/// Tiny recursive top-down merge sort optimized for binary size. It has no adaptiveness whatsoever,
/// no run detection, etc.
#[inline(always)]
pub fn mergesort<T, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
scratch: &mut [MaybeUninit<T>],
is_less: &mut F,
) {
let len = v.len();

if len > 2 {
let mid = len / 2;

// SAFETY: mid is in-bounds.
unsafe {
// Sort the left half recursively.
mergesort(v.get_unchecked_mut(..mid), scratch, is_less);
// Sort the right half recursively.
mergesort(v.get_unchecked_mut(mid..), scratch, is_less);
}

merge::merge(v, scratch, mid, is_less);
} else if len == 2 {
// Branchless swap the two elements. This reduces the recursion depth and improves
// perf significantly at a small binary-size cost. Trades ~10% perf boost for integers
// for ~50 bytes in the binary.

// SAFETY: We checked the len, the pointers we create are valid and don't overlap.
unsafe {
swap_if_less(v.as_mut_ptr(), 0, 1, is_less);
}
}
}

/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
/// value at position `b_pos` is less than the one at position `a_pos`.
unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
// SAFETY: the caller must guarantee that `a` and `b` each added to `v_base` yield valid
// pointers into `v_base`, and are properly aligned, and part of the same allocation.
unsafe {
let v_a = v_base.add(a_pos);
let v_b = v_base.add(b_pos);

// PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be
// in a well defined state, without duplicates.

// Important to only swap if it is more and not if it is equal. is_less should return false for
// equal, so we don't swap.
let should_swap = is_less(&*v_b, &*v_a);

// This is a branchless version of swap if.
// The equivalent code with a branch would be:
//
// if should_swap {
// ptr::swap(left, right, 1);
// }

// The goal is to generate cmov instructions here.
let left_swap = if should_swap { v_b } else { v_a };
let right_swap = if should_swap { v_a } else { v_b };

let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap));
ptr::copy(left_swap, v_a, 1);
ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1);
}
}
Loading

0 comments on commit 1805c29

Please sign in to comment.