Redo the swap code for better tail & padding handling #134954

Status: Closed (wants to merge 1 commit)

17 changes: 17 additions & 0 deletions compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -498,6 +498,23 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
}
}

sym::untyped_swap_nonoverlapping => {
Member:

Is there a good reason for why this is handled in a different way than typed_swap_nonoverlapping, and in a completely different place as well?

// The fallback impl uses memcpy, which leaves around allocas
// that don't optimize out for certain widths, so force it to
// use SSA registers instead.

let chunk_ty = fn_args.type_at(0);
let layout = self.layout_of(chunk_ty).layout;
let integer_ty = self.type_ix(layout.size().bits());
let a = args[0].immediate();
let b = args[1].immediate();
let a_val = self.load(integer_ty, a, layout.align().abi);
let b_val = self.load(integer_ty, b, layout.align().abi);
self.store(b_val, a, layout.align().abi);
self.store(a_val, b, layout.align().abi);
return Ok(());
}

sym::compare_bytes => {
// Here we assume that the `memcmp` provided by the target is a NOP for size 0.
let cmp = self.call_intrinsic("memcmp", &[
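
An aside on what this lowering amounts to: instead of the memcpy-based fallback, the backend now emits one integer load per side followed by two stores. A rough Rust-level sketch for a 4-byte chunk (hypothetical helper, not part of the PR; the real lowering uses the chunk type's own alignment rather than unaligned accesses):

unsafe fn swap_4_byte_chunk(a: *mut u8, b: *mut u8) {
    // Load both sides into SSA values first, then store them swapped.
    // No temporary alloca is involved, so there is nothing left for
    // LLVM to fail to optimize out.
    let a_val = a.cast::<u32>().read_unaligned();
    let b_val = b.cast::<u32>().read_unaligned();
    a.cast::<u32>().write_unaligned(b_val);
    b.cast::<u32>().write_unaligned(a_val);
}
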
6 changes: 6 additions & 0 deletions compiler/rustc_hir_analysis/src/check/intrinsic.rs
@@ -504,6 +504,12 @@ pub fn check_intrinsic_type(
sym::typed_swap_nonoverlapping => {
(1, 0, vec![Ty::new_mut_ptr(tcx, param(0)); 2], tcx.types.unit)
}
sym::untyped_swap_nonoverlapping => (
1,
0,
vec![Ty::new_mut_ptr(tcx, Ty::new_maybe_uninit(tcx, param(0))); 2],
tcx.types.unit,
),

sym::discriminant_value => {
let assoc_items = tcx.associated_item_def_ids(
1 change: 1 addition & 0 deletions compiler/rustc_span/src/symbol.rs
@@ -2142,6 +2142,7 @@ symbols! {
unstable location; did you mean to load this crate \
from crates.io via `Cargo.toml` instead?",
untagged_unions,
untyped_swap_nonoverlapping,
unused_imports,
unwind,
unwind_attributes,
34 changes: 32 additions & 2 deletions library/core/src/intrinsics/mod.rs
@@ -66,7 +66,7 @@

use crate::marker::{DiscriminantKind, Tuple};
use crate::mem::SizedTypeProperties;
use crate::{ptr, ub_checks};
use crate::{mem, ptr, ub_checks};

pub mod fallback;
pub mod mir;
@@ -4003,7 +4003,37 @@ pub use typed_swap as typed_swap_nonoverlapping;
pub const unsafe fn typed_swap_nonoverlapping<T>(x: *mut T, y: *mut T) {
// SAFETY: The caller provided single non-overlapping items behind
// pointers, so swapping them with `count: 1` is fine.
unsafe { ptr::swap_nonoverlapping(x, y, 1) };
unsafe { crate::swapping::swap_nonoverlapping(x, y, 1) };
}

/// Swaps the `size_of::<C>()` untyped & non-overlapping bytes behind the two pointers.
///
/// Split out from `typed_swap` for the internal swaps in `swap_nonoverlapping`
/// which would otherwise cause cycles between the fallback implementations on
/// backends where neither is overridden.
///
/// # Safety
///
/// `x` and `y` are readable and writable as `MaybeUninit<C>` and non-overlapping.
#[inline]
#[rustc_nounwind]
#[cfg_attr(not(bootstrap), rustc_intrinsic)]
#[miri::intrinsic_fallback_is_spec]
#[rustc_const_stable_indirect]
pub const unsafe fn untyped_swap_nonoverlapping<C>(
x: *mut mem::MaybeUninit<C>,
y: *mut mem::MaybeUninit<C>,
) {
// This intentionally uses untyped memory copies, not reads/writes,
// to avoid any risk of losing padding in things like (u16, u8).
let mut temp = mem::MaybeUninit::<C>::uninit();
// SAFETY: Caller promised that x and y are non-overlapping & read/writeable,
// and our fresh local is always disjoint from anything otherwise readable.
unsafe {
(&raw mut temp).copy_from_nonoverlapping(x, 1);
x.copy_from_nonoverlapping(y, 1);
Member (suggested change):
-    x.copy_from_nonoverlapping(y, 1);
+    // Important for miri::intrinsic_fallback_is_spec: here we get language UB
+    // if x and y overlap.
+    x.copy_from_nonoverlapping(y, 1);

y.copy_from_nonoverlapping(&raw const temp, 1);
}
}
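
For concreteness on the padding point: `(u16, u8)` occupies four bytes on current rustc, with byte 3 being padding, and a typed read/write is allowed to drop that byte, while the untyped copies above must move all four. A small sketch of the layout assumption (repr(Rust) tuple layout is not guaranteed, so this is illustrative only):

use core::mem::{align_of, size_of};

fn main() {
    // u16 at offset 0, u8 at offset 2, one padding byte at offset 3.
    assert_eq!(size_of::<(u16, u8)>(), 4);
    assert_eq!(align_of::<(u16, u8)>(), 2);
}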

/// Returns whether we should perform some UB-checking at runtime. This eventually evaluates to
1 change: 1 addition & 0 deletions library/core/src/lib.rs
@@ -376,6 +376,7 @@ pub mod alloc;
// note: does not need to be public
mod bool;
mod escape;
pub(crate) mod swapping;
mod tuple;
mod unit;

81 changes: 2 additions & 79 deletions library/core/src/ptr/mod.rs
@@ -395,7 +395,6 @@
#![allow(clippy::not_unsafe_ptr_arg_deref)]

use crate::cmp::Ordering;
use crate::intrinsics::const_eval_select;
use crate::marker::FnPtr;
use crate::mem::{self, MaybeUninit, SizedTypeProperties};
use crate::{fmt, hash, intrinsics, ub_checks};
@@ -1092,84 +1091,8 @@ pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
}
);

const_eval_select!(
@capture[T] { x: *mut T, y: *mut T, count: usize }:
if const {
// At compile-time we want to always copy this in chunks of `T`, to ensure that if there
// are pointers inside `T` we will copy them in one go rather than trying to copy a part
// of a pointer (which would not work).
// SAFETY: Same preconditions as this function
unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }
} else {
macro_rules! attempt_swap_as_chunks {
($ChunkTy:ty) => {
if mem::align_of::<T>() >= mem::align_of::<$ChunkTy>()
&& mem::size_of::<T>() % mem::size_of::<$ChunkTy>() == 0
{
let x: *mut $ChunkTy = x.cast();
let y: *mut $ChunkTy = y.cast();
let count = count * (mem::size_of::<T>() / mem::size_of::<$ChunkTy>());
// SAFETY: these are the same bytes that the caller promised were
// ok, just typed as `MaybeUninit<ChunkTy>`s instead of as `T`s.
// The `if` condition above ensures that we're not violating
// alignment requirements, and that the division is exact so
// that we don't lose any bytes off the end.
return unsafe { swap_nonoverlapping_simple_untyped(x, y, count) };
}
};
}

// Split up the slice into small power-of-two-sized chunks that LLVM is able
// to vectorize (unless it's a special type with more-than-pointer alignment,
// because we don't want to pessimize things like slices of SIMD vectors.)
if mem::align_of::<T>() <= mem::size_of::<usize>()
&& (!mem::size_of::<T>().is_power_of_two()
|| mem::size_of::<T>() > mem::size_of::<usize>() * 2)
{
attempt_swap_as_chunks!(usize);
attempt_swap_as_chunks!(u8);
}

// SAFETY: Same preconditions as this function
unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }
}
)
}

/// Same behavior and safety conditions as [`swap_nonoverlapping`]
///
/// LLVM can vectorize this (at least it can for the power-of-two-sized types
/// `swap_nonoverlapping` tries to use) so no need to manually SIMD it.
#[inline]
const unsafe fn swap_nonoverlapping_simple_untyped<T>(x: *mut T, y: *mut T, count: usize) {
let x = x.cast::<MaybeUninit<T>>();
let y = y.cast::<MaybeUninit<T>>();
let mut i = 0;
while i < count {
// SAFETY: By precondition, `i` is in-bounds because it's below `n`
let x = unsafe { x.add(i) };
// SAFETY: By precondition, `i` is in-bounds because it's below `n`
// and it's distinct from `x` since the ranges are non-overlapping
let y = unsafe { y.add(i) };

// If we end up here, it's because we're using a simple type -- like
// a small power-of-two-sized thing -- or a special type with particularly
// large alignment, particularly SIMD types.
// Thus, we're fine just reading-and-writing it, as either it's small
// and that works well anyway or it's special and the type's author
// presumably wanted things to be done in the larger chunk.

// SAFETY: we're only ever given pointers that are valid to read/write,
// including being aligned, and nothing here panics so it's drop-safe.
unsafe {
let a: MaybeUninit<T> = read(x);
let b: MaybeUninit<T> = read(y);
write(x, b);
write(y, a);
}

i += 1;
}
// SAFETY: Same preconditions as this function
unsafe { crate::swapping::swap_nonoverlapping(x, y, count) }
}

/// Moves `src` into the pointed `dst`, returning the previous `dst` value.
182 changes: 182 additions & 0 deletions library/core/src/swapping.rs
@@ -0,0 +1,182 @@
use crate::{hint, intrinsics, mem, ptr};

//#[rustc_const_stable_indirect]
//#[rustc_allow_const_fn_unstable(const_eval_select)]
#[rustc_const_unstable(feature = "const_swap_nonoverlapping", issue = "133668")]
#[inline]
pub(crate) const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
Member:

Please add a comment clarifying whether this is typed or untyped.

intrinsics::const_eval_select!(
@capture[T] { x: *mut T, y: *mut T, count: usize }:
if const {
// At compile-time we want to always copy this in chunks of `T`, to ensure that if there
// are pointers inside `T` we will copy them in one go rather than trying to copy a part
// of a pointer (which would not work).
// SAFETY: Same preconditions as this function
unsafe { swap_nonoverlapping_const(x, y, count) }
} else {
// At runtime we want to make sure not to swap byte-for-byte for types like [u8; 15],
// and swapping as `MaybeUninit<T>` doesn't actually work as untyped for things like
// T = (u16, u8), so we type-erase to raw bytes and swap that way.
// SAFETY: Same preconditions as this function
unsafe { swap_nonoverlapping_runtime(x, y, count) }
}
)
}
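
As an illustration of why the const arm insists on whole-`T` chunks: values containing pointers cannot be copied half a pointer at a time during const eval. A sketch, assuming the unstable `const_swap_nonoverlapping` feature that this path is gated on:

#![feature(const_swap_nonoverlapping)]

const SWAPPED: (&str, &str) = {
    let mut a = "hello";
    let mut b = "world";
    // Const eval must move each &str (a pointer plus a length) as one
    // unit; the byte-chunked runtime strategy would split the pointer
    // and fail to evaluate.
    unsafe { core::ptr::swap_nonoverlapping(&mut a, &mut b, 1) };
    (a, b)
};

fn main() {
    assert_eq!(SWAPPED, ("world", "hello"));
}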

/// Same behavior and safety conditions as [`swap_nonoverlapping`]
#[rustc_const_stable_indirect]
#[inline]
const unsafe fn swap_nonoverlapping_const<T>(x: *mut T, y: *mut T, count: usize) {
let x = x.cast::<mem::MaybeUninit<T>>();
let y = y.cast::<mem::MaybeUninit<T>>();
let mut i = 0;
while i < count {
// SAFETY: By precondition, `i` is in-bounds because it's below `count`,
// and because the two input ranges are non-overlapping and read/writeable,
// the individual items inside them are too.
unsafe {
intrinsics::untyped_swap_nonoverlapping::<T>(x.add(i), y.add(i));
}

i += 1;
}
}

// Scale the monomorphizations with the size of the machine, roughly.
const MAX_ALIGN: usize = align_of::<usize>().pow(2);
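
(A quick sketch of the concrete values, assuming the usual 4- or 8-byte `usize`: 4.pow(2) == 16 on 32-bit targets and 8.pow(2) == 64 on 64-bit ones.)

// Hypothetical sanity check for the two common cases:
const _: () = assert!(MAX_ALIGN == 16 || MAX_ALIGN == 64);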

/// Same behavior and safety conditions as [`swap_nonoverlapping`]
#[inline]
unsafe fn swap_nonoverlapping_runtime<T>(x: *mut T, y: *mut T, count: usize) {
let bytes = {
let slice = ptr::slice_from_raw_parts(x, count);
// SAFETY: Because they both exist in memory and don't overlap, they
// must be legal slice sizes (below `isize::MAX` bytes).
unsafe { mem::size_of_val_raw(slice) }
};

// Generating *untyped* loops for every type is silly, so we polymorphize away
// the actual type, but we want to take advantage of alignment if possible,
// so monomorphize for a restricted set of possible alignments.
macro_rules! delegate_by_alignment {
($($p:pat => $align:expr,)+) => {{
#![allow(unreachable_patterns)]
match const { align_of::<T>() } {
$(
$p => {
swap_nonoverlapping_bytes::<$align>(x.cast(), y.cast(), bytes);
}
)+
}
}};
}

// SAFETY: each arm only fires when `align_of::<T>()` is at least the
// `$align` it passes down, so the casts keep `x` and `y` suitably
// aligned, and the caller guarantees the regions are readable,
// writable, and non-overlapping for `bytes` bytes.
unsafe {
delegate_by_alignment! {
MAX_ALIGN.. => MAX_ALIGN,
64.. => 64,
32.. => 32,
Member (on lines +77 to +79): On 32-bit machines, the `64..` and `32..` arms are dead code. Is that deliberate? If so, that should be explained in comments.

16.. => 16,
8.. => 8,
4.. => 4,
2.. => 2,
_ => 1,
}
}
}

/// # Safety
/// - `x` and `y` must be aligned to `ALIGN`
/// - `bytes` must be a multiple of `ALIGN`
/// - They must be readable, writable, and non-overlapping for `bytes` bytes
#[inline]
unsafe fn swap_nonoverlapping_bytes<const ALIGN: usize>(
x: *mut mem::MaybeUninit<u8>,
y: *mut mem::MaybeUninit<u8>,
bytes: usize,
) {
// SAFETY: Two legal non-overlapping regions can't be bigger than this.
// (And they couldn't have made allocations any bigger either anyway.)
// FIXME: Would be nice to have a type for this instead of the assume.
unsafe { hint::assert_unchecked(bytes < isize::MAX as usize) };

let mut i = 0;
macro_rules! swap_next_n {
($n:expr) => {{
let x: *mut mem::MaybeUninit<[u8; $n]> = x.add(i).cast();
let y: *mut mem::MaybeUninit<[u8; $n]> = y.add(i).cast();
swap_nonoverlapping_aligned_chunk::<ALIGN, [u8; $n]>(
x.as_mut_unchecked(),
y.as_mut_unchecked(),
);
i += $n;
}};
}

while bytes - i >= MAX_ALIGN {
const { assert!(MAX_ALIGN >= ALIGN) };
// SAFETY: the const-assert above confirms we're only ever called with
// an alignment equal to or smaller than max align, so this is necessarily
// aligned, and the while loop ensures there's enough read/write memory.
unsafe {
swap_next_n!(MAX_ALIGN);
}
}

macro_rules! handle_tail {
($($n:literal)+) => {$(
if const { $n % ALIGN == 0 } {
// Checking this way simplifies the block end to just add+test,
// rather than needing extra math before the check.
if (bytes & $n) != 0 {
// SAFETY: The above swaps were bigger, so could not have
// impacted the `$n`-relevant bit, so checking `bytes & $n`
// was equivalent to `bytes - i >= $n`, and thus we have
// enough space left to swap another `$n` bytes.
unsafe {
swap_next_n!($n);
}
}
}
)+};
}
const { assert!(MAX_ALIGN <= 64) };
handle_tail!(32 16 8 4 2 1);

debug_assert_eq!(i, bytes);
}
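
A worked example of the loop-plus-tail decomposition (hypothetical helper, assuming a 64-bit target where MAX_ALIGN is 64): 44 bytes never enter the main loop, and the tail bit-tests pick out 32 + 8 + 4 bytes, which is exactly what the swap_44_bytes assembly test below expects.

fn chunk_sizes(mut bytes: usize) -> Vec<usize> {
    const MAX_ALIGN: usize = 64; // assumes a 64-bit target
    let mut chunks = Vec::new();
    while bytes >= MAX_ALIGN {
        chunks.push(MAX_ALIGN);
        bytes -= MAX_ALIGN;
    }
    // Mirrors handle_tail!(32 16 8 4 2 1): after the big chunks, each
    // remaining power of two is taken iff its bit is set in `bytes`.
    for n in [32, 16, 8, 4, 2, 1] {
        if bytes & n != 0 {
            chunks.push(n);
        }
    }
    chunks
}

fn main() {
    // 44 == 0b101100, so the tail swaps 32, then 8, then 4 bytes.
    assert_eq!(chunk_sizes(44), [32, 8, 4]);
}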

/// Swaps the `C` behind `x` and `y` as untyped memory
///
/// # Safety
///
/// Both `x` and `y` must be aligned to `ALIGN`, in addition to their normal alignment.
/// They must be readable and writeable for `sizeof(C)` bytes, as usual for `&mut`s.
///
/// (The actual instantiations are usually `C = [u8; _]`, so we get the alignment
/// information from the loads by `assume`ing the passed-in alignment.)
// Don't let MIR inline this, because we really want it to keep its noalias metadata
#[rustc_no_mir_inline]
#[inline]
unsafe fn swap_nonoverlapping_aligned_chunk<const ALIGN: usize, C>(
x: &mut mem::MaybeUninit<C>,
y: &mut mem::MaybeUninit<C>,
) {
// Note: this can't be a `const` block, because callers guard the
// possible `ALIGN`s with runtime checks, so a const assert would fail
// to const-eval for the arms that are never actually taken.
assert!(size_of::<C>() % ALIGN == 0);
Member: It's a bit odd to see a runtime assertion in code like this. Does this work in a const { ... } block?

Member (author): That's how I first had it, but no, it doesn't, because calls to this are guarded with "runtime" checks on the specific ALIGN used -- aka it would need a const-eval-impacting const if for the caller.

I could change it to debug_assert or just remove it.

Member: Ah, right. Just adding a comment about this could already help.


let x = ptr::from_mut(x);
let y = ptr::from_mut(y);

// SAFETY: One of our preconditions.
unsafe {
hint::assert_unchecked(x.is_aligned_to(ALIGN));
hint::assert_unchecked(y.is_aligned_to(ALIGN));
Member (on lines +173 to +174):
Do these actually help? Seems worth commenting on that since experience shows that assert_unchecked can often hurt codegen.

}

// SAFETY: The memory is readable and writable because these were passed to
// us as mutable references, and the untyped swap doesn't need validity.
unsafe {
intrinsics::untyped_swap_nonoverlapping::<C>(x, y);
}
}
29 changes: 29 additions & 0 deletions library/core/tests/ptr.rs
@@ -992,3 +992,32 @@ fn test_ptr_metadata_in_const() {
assert_eq!(SLICE_META, 3);
assert_eq!(DYN_META.size_of(), 42);
}

// See <https://github.com/rust-lang/rust/issues/134713>
#[test]
fn test_ptr_swap_nonoverlapping_swaps_padding() {
#[repr(C)]
struct Foo(usize, u8);

let buf1: [usize; 2] = [1000, 2000];
let buf2: [usize; 2] = [3000, 4000];

// Foo and [usize; 2] have the same size and alignment,
// so swap_nonoverlapping should treat them the same
assert_eq!(size_of::<Foo>(), size_of::<[usize; 2]>());
assert_eq!(align_of::<Foo>(), align_of::<[usize; 2]>());

let mut b1 = buf1;
let mut b2 = buf2;
// Safety: b1 and b2 are distinct local variables,
// with the same size and alignment as Foo.
unsafe {
std::ptr::swap_nonoverlapping(
b1.as_mut_ptr().cast::<Foo>(),
b2.as_mut_ptr().cast::<Foo>(),
1,
);
}
assert_eq!(b1, buf2);
assert_eq!(b2, buf1);
}
32 changes: 32 additions & 0 deletions tests/assembly/x86_64-typed-swap.rs
@@ -51,3 +51,35 @@ pub fn swap_simd(x: &mut __m128, y: &mut __m128) {
// CHECK: retq
swap(x, y)
}

// CHECK-LABEL: swap_string:
#[no_mangle]
pub fn swap_string(x: &mut String, y: &mut String) {
// CHECK: movups (%[[ARG1]]), %[[T1a:xmm.]]
// CHECK: movups (%[[ARG2]]), %[[T2a:xmm.]]
// CHECK: movups %[[T2a]], (%[[ARG1]])
// CHECK: movups %[[T1a]], (%[[ARG2]])
// CHECK: movq 16(%[[ARG1]]), %[[T1b:r.+]]
// CHECK: movq 16(%[[ARG2]]), %[[T2b:r.+]]
// CHECK: movq %[[T2b]], 16(%[[ARG1]])
// CHECK: movq %[[T1b]], 16(%[[ARG2]])
// CHECK: retq
swap(x, y)
}

// CHECK-LABEL: swap_44_bytes:
#[no_mangle]
pub fn swap_44_bytes(x: &mut [u8; 44], y: &mut [u8; 44]) {
// Ensure we do better than a long run of byte copies,
// see <https://github.com/rust-lang/rust/issues/134946>

// CHECK-NOT: movb
// CHECK-COUNT-8: movups{{.+}}xmm
// CHECK-NOT: movb
// CHECK-COUNT-4: movq
// CHECK-NOT: movb
// CHECK-COUNT-4: movl
// CHECK-NOT: movb
// CHECK: retq
swap(x, y)
}
30 changes: 26 additions & 4 deletions tests/codegen/simd/swap-simd-types.rs
@@ -23,8 +23,19 @@ pub fn swap_single_m256(x: &mut __m256, y: &mut __m256) {
#[no_mangle]
pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
// CHECK-NOT: alloca
// CHECK: load <8 x float>{{.+}}align 32
// CHECK: store <8 x float>{{.+}}align 32

// CHECK-NOT: load i128
// CHECK-NOT: load i64
// CHECK-NOT: load i32

// CHECK: [[A:%.+]] = load i256{{.+}}align 32
// CHECK: [[B:%.+]] = load i256{{.+}}align 32
// CHECK: store i256 [[B]]{{.+}}align 32
// CHECK: store i256 [[A]]{{.+}}align 32

// CHECK-NOT: load i128
// CHECK-NOT: load i64
// CHECK-NOT: load i32
if x.len() == y.len() {
x.swap_with_slice(y);
}
@@ -34,7 +45,18 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
#[no_mangle]
pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) {
// CHECK-NOT: alloca
// CHECK: load <32 x i8>{{.+}}align 1
// CHECK: store <32 x i8>{{.+}}align 1

// CHECK-NOT: load i128
// CHECK-NOT: load i64
// CHECK-NOT: load i32

// CHECK: [[A:%.+]] = load i256{{.+}}align 1
// CHECK: [[B:%.+]] = load i256{{.+}}align 1
// CHECK: store i256 [[B]]{{.+}}align 1
// CHECK: store i256 [[A]]{{.+}}align 1

// CHECK-NOT: load i128
// CHECK-NOT: load i64
// CHECK-NOT: load i32
swap(x, y)
}
59 changes: 34 additions & 25 deletions tests/codegen/swap-large-types.rs
@@ -33,35 +33,42 @@ pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
#[no_mangle]
pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
// CHECK-NOT: alloca
// CHECK: load <{{[0-9]+}} x i64>
// CHECK: store <{{[0-9]+}} x i64>
// CHECK-COUNT-2: load i512{{.+}}align 8
// CHECK-COUNT-2: store i512{{.+}}align 8
// CHECK-COUNT-2: load i512{{.+}}align 8
// CHECK-COUNT-2: store i512{{.+}}align 8
// CHECK-COUNT-2: load i512{{.+}}align 8
// CHECK-COUNT-2: store i512{{.+}}align 8
// CHECK-COUNT-2: load i64{{.+}}align 8
// CHECK-COUNT-2: store i64{{.+}}align 8
swap(x, y)
}

// Verify that types with usize alignment are swapped via vectored usizes,
// not falling back to byte-level code.

// CHECK-LABEL: @swap_slice
#[no_mangle]
pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
// CHECK-NOT: alloca
// CHECK: load <{{[0-9]+}} x i64>
// CHECK: store <{{[0-9]+}} x i64>
if x.len() == y.len() {
x.swap_with_slice(y);
}
}

// But for a large align-1 type, vectorized byte copying is what we want.

type OneKilobyteBuffer = [u8; 1024];

// CHECK-LABEL: @swap_1kb_slices
#[no_mangle]
pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) {
// CHECK-NOT: alloca
// CHECK: load <{{[0-9]+}} x i8>
// CHECK: store <{{[0-9]+}} x i8>

// These are so big that only the biggest chunk size is used

// CHECK-NOT: load i256
// CHECK-NOT: load i128
// CHECK-NOT: load i64
// CHECK-NOT: load i32
// CHECK-NOT: load i16
// CHECK-NOT: load i8

// CHECK-COUNT-2: load i512{{.+}}align 1
// CHECK-COUNT-2: store i512{{.+}}align 1

// CHECK-NOT: store i256
// CHECK-NOT: store i128
// CHECK-NOT: store i64
// CHECK-NOT: store i32
// CHECK-NOT: store i16
// CHECK-NOT: store i8
if x.len() == y.len() {
x.swap_with_slice(y);
}
@@ -81,10 +88,12 @@ pub struct BigButHighlyAligned([u8; 64 * 3]);
// CHECK-LABEL: @swap_big_aligned
#[no_mangle]
pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
// CHECK-NOT: call void @llvm.memcpy
// CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
// CHECK-NOT: call void @llvm.memcpy
// CHECK-NOT: alloca
// CHECK-COUNT-2: load i512{{.+}}align 64
// CHECK-COUNT-2: store i512{{.+}}align 64
// CHECK-COUNT-2: load i512{{.+}}align 64
// CHECK-COUNT-2: store i512{{.+}}align 64
// CHECK-COUNT-2: load i512{{.+}}align 64
// CHECK-COUNT-2: store i512{{.+}}align 64
swap(x, y)
}
106 changes: 81 additions & 25 deletions tests/codegen/swap-small-types.rs
@@ -27,13 +27,19 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
// CHECK-NOT: alloca

// Whether `i8` is the best for this is unclear, but
// might as well record what's actually happening right now.

// CHECK: load i8
// CHECK: load i8
// CHECK: store i8
// CHECK: store i8
// Swapping `i48` might be cleaner in LLVM-IR here, but `i32`+`i16` isn't bad,
// and is closer to the assembly it generates anyway.

// CHECK-NOT: load
// CHECK: load i32{{.+}}align 2
// CHECK-NEXT: load i32{{.+}}align 2
// CHECK-NEXT: store i32{{.+}}align 2
// CHECK-NEXT: store i32{{.+}}align 2
// CHECK: load i16{{.+}}align 2
// CHECK-NEXT: load i16{{.+}}align 2
// CHECK-NEXT: store i16{{.+}}align 2
// CHECK-NEXT: store i16{{.+}}align 2
// CHECK-NOT: store
swap(x, y)
}

@@ -54,19 +60,27 @@ pub fn swap_rgba64(x: &mut RGBA64, y: &mut RGBA64) {
#[no_mangle]
pub fn swap_vecs(x: &mut Vec<u32>, y: &mut Vec<u32>) {
// CHECK-NOT: alloca
// There are plenty more loads and stores than just these,
// but at least one sure better be 64-bit (for size or capacity).
// CHECK: load i64

// CHECK-NOT: load
// CHECK: load i128
// CHECK-NEXT: load i128
// CHECK-NEXT: store i128
// CHECK-NEXT: store i128
// CHECK: load i64
// CHECK: store i64
// CHECK: store i64
// CHECK: ret void
// CHECK-NEXT: load i64
// CHECK-NEXT: store i64
// CHECK-NEXT: store i64
// CHECK-NOT: store
swap(x, y)
}

// CHECK-LABEL: @swap_slices
#[no_mangle]
pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
// Note that separate loads here are fine, as they merge to `movups` anyway
// at the assembly level, so staying more obviously typed and as a scalar
// pair -- like they're used elsewhere -- is ok, no need to force `i128`.

// CHECK-NOT: alloca
// CHECK: load ptr
// CHECK: load i64
@@ -76,45 +90,84 @@ pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
swap(x, y)
}

// LLVM doesn't vectorize a loop over 3-byte elements,
// so we chunk it down to bytes and loop over those instead.
type RGB24 = [u8; 3];

// CHECK-LABEL: @swap_rgb24_slices
#[no_mangle]
pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
// CHECK-NOT: alloca
// CHECK: load <{{[0-9]+}} x i8>
// CHECK: store <{{[0-9]+}} x i8>

// The odd size means we need the full set.

// CHECK-COUNT-2: load i512{{.+}}align 1
// CHECK-NEXT: store i512{{.+}}align 1
// CHECK-COUNT-2: load i256{{.+}}align 1
// CHECK-NEXT: store i256{{.+}}align 1
// CHECK-COUNT-2: load i128{{.+}}align 1
// CHECK-NEXT: store i128{{.+}}align 1
// CHECK-COUNT-2: load i64{{.+}}align 1
// CHECK-NEXT: store i64{{.+}}align 1
// CHECK-COUNT-2: load i32{{.+}}align 1
// CHECK-NEXT: store i32{{.+}}align 1
// CHECK-COUNT-2: load i16{{.+}}align 1
// CHECK-NEXT: store i16{{.+}}align 1
// CHECK-COUNT-2: load i8{{.+}}align 1
// CHECK-NEXT: store i8{{.+}}align 1
if x.len() == y.len() {
x.swap_with_slice(y);
}
}

// This one has a power-of-two size, so we iterate over it directly
type RGBA32 = [u8; 4];

// CHECK-LABEL: @swap_rgba32_slices
#[no_mangle]
pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
// CHECK-NOT: alloca
// CHECK: load <{{[0-9]+}} x i32>
// CHECK: store <{{[0-9]+}} x i32>

// Because the size in bytes is a multiple of 4, we can skip the smallest sizes.

// CHECK-COUNT-2: load i512{{.+}}align 1
// CHECK-NEXT: store i512{{.+}}align 1
// CHECK-COUNT-2: load i256{{.+}}align 1
// CHECK-NEXT: store i256{{.+}}align 1
// CHECK-COUNT-2: load i128{{.+}}align 1
// CHECK-NEXT: store i128{{.+}}align 1
// CHECK-COUNT-2: load i64{{.+}}align 1
// CHECK-NEXT: store i64{{.+}}align 1
// CHECK-COUNT-2: load i32{{.+}}align 1
// CHECK-NEXT: store i32{{.+}}align 1
// CHECK-NOT: load i16
// CHECK-NOT: store i16
// CHECK-NOT: load i8
// CHECK-NOT: store i8
if x.len() == y.len() {
x.swap_with_slice(y);
}
}

// Strings have a non-power-of-two size, but have pointer alignment,
// so we swap usizes instead of dropping all the way down to bytes.
// Strings have a non-power-of-two size, but have pointer alignment.
const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());

// CHECK-LABEL: @swap_string_slices
#[no_mangle]
pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {
// CHECK-NOT: alloca
// CHECK: load <{{[0-9]+}} x i64>
// CHECK: store <{{[0-9]+}} x i64>

// CHECK-COUNT-2: load i512{{.+}}align 8
// CHECK-NEXT: store i512{{.+}}align 8
// CHECK-COUNT-2: load i256{{.+}}align 8
// CHECK-NEXT: store i256{{.+}}align 8
// CHECK-COUNT-2: load i128{{.+}}align 8
// CHECK-NEXT: store i128{{.+}}align 8
// CHECK-COUNT-2: load i64{{.+}}align 8
// CHECK-NEXT: store i64{{.+}}align 8
// CHECK-NOT: load i32
// CHECK-NOT: store i32
// CHECK-NOT: load i16
// CHECK-NOT: store i16
// CHECK-NOT: load i8
// CHECK-NOT: store i8
if x.len() == y.len() {
x.swap_with_slice(y);
}
@@ -130,6 +183,9 @@ pub struct Packed {
#[no_mangle]
pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) {
// CHECK-NOT: alloca
// CHECK: ret void
// CHECK-COUNT-2: load i64{{.+}}align 1
// CHECK-COUNT-2: store i64{{.+}}align 1
// CHECK-COUNT-2: load i8{{.+}}align 1
// CHECK-COUNT-2: store i8{{.+}}align 1
swap(x, y)
}