From 345cb72bef7dd19c767eb79395ecaf5f2dbdddd8 Mon Sep 17 00:00:00 2001
From: Joe Richey
Date: Fri, 23 Oct 2020 04:27:00 -0700
Subject: [PATCH 1/3] Reorganize mem functions

This reduces the amount of platform-specific code

Signed-off-by: Joe Richey
---
 src/mem/impls.rs  | 29 +++++++++++++++++++++++++++++
 src/mem/memcpy.rs | 41 -----------------------------------------
 src/mem/mod.rs    | 28 ++++++++++++++++++++++++++--
 src/mem/x86_64.rs | 26 +++++++-------------------
 4 files changed, 62 insertions(+), 62 deletions(-)
 create mode 100644 src/mem/impls.rs
 delete mode 100644 src/mem/memcpy.rs

diff --git a/src/mem/impls.rs b/src/mem/impls.rs
new file mode 100644
index 00000000..6bd1a7ba
--- /dev/null
+++ b/src/mem/impls.rs
@@ -0,0 +1,29 @@
+use super::c_int;
+
+#[inline(always)]
+pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, n: usize) {
+    let mut i = 0;
+    while i < n {
+        *dest.offset(i as isize) = *src.offset(i as isize);
+        i += 1;
+    }
+}
+
+#[inline(always)]
+pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, n: usize) {
+    // copy from end
+    let mut i = n;
+    while i != 0 {
+        i -= 1;
+        *dest.offset(i as isize) = *src.offset(i as isize);
+    }
+}
+
+#[inline(always)]
+pub unsafe fn set_bytes(s: *mut u8, c: u8, n: usize) {
+    let mut i = 0;
+    while i < n {
+        *s.offset(i as isize) = c;
+        i += 1;
+    }
+}
diff --git a/src/mem/memcpy.rs b/src/mem/memcpy.rs
deleted file mode 100644
index 8fada9bc..00000000
--- a/src/mem/memcpy.rs
+++ /dev/null
@@ -1,41 +0,0 @@
-use super::c_int;
-
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
-    let mut i = 0;
-    while i < n {
-        *dest.offset(i as isize) = *src.offset(i as isize);
-        i += 1;
-    }
-    dest
-}
-
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
-    if src < dest as *const u8 {
-        // copy from end
-        let mut i = n;
-        while i != 0 {
-            i -= 1;
-            *dest.offset(i as isize) = *src.offset(i as isize);
-        }
-    } else {
-        // copy from beginning
-        let mut i = 0;
-        while i < n {
-            *dest.offset(i as isize) = *src.offset(i as isize);
-            i += 1;
-        }
-    }
-    dest
-}
-
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
-    let mut i = 0;
-    while i < n {
-        *s.offset(i as isize) = c as u8;
-        i += 1;
-    }
-    s
-}
diff --git a/src/mem/mod.rs b/src/mem/mod.rs
index aa9d4b61..6bc76337 100644
--- a/src/mem/mod.rs
+++ b/src/mem/mod.rs
@@ -11,8 +11,32 @@ use core::ops::{BitOr, Shl};
 
 // memcpy/memmove/memset have optimized implementations on some architectures
 #[cfg_attr(all(feature = "asm", target_arch = "x86_64"), path = "x86_64.rs")]
-mod memcpy;
-pub use self::memcpy::*;
+mod impls;
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
+    impls::copy_forward(dest, src, n);
+    dest
+}
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
+    let delta = (dest as usize).wrapping_sub(src as usize);
+    if delta >= n {
+        // We can copy forwards because either dest is far enough ahead of src,
+        // or src is ahead of dest (and delta overflowed).
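+        // (Worked example with made-up addresses: dest = 0x1008, src = 0x1000,
+        // n = 8 gives delta = 8 >= n, so the two regions do not overlap and a
+        // forward copy is safe; dest = 0x1000, src = 0x1008 gives delta =
+        // usize::MAX - 7, which is also >= n, and copying forward when dest
+        // precedes src never overwrites bytes that are still to be read.)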
+        impls::copy_forward(dest, src, n);
+    } else {
+        impls::copy_backward(dest, src, n);
+    }
+    dest
+}
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
+    impls::set_bytes(s, c as u8, n);
+    s
+}
 
 #[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
 pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
index 1ecffce4..b94a09da 100644
--- a/src/mem/x86_64.rs
+++ b/src/mem/x86_64.rs
@@ -1,5 +1,3 @@
-use super::c_int;
-
 // On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
 // been enhanced to perform better than an simple qword loop, making them ideal
 // for implementing memcpy/memset. Note that "rep cmps" has received no such
@@ -16,8 +14,8 @@ use super::c_int;
 // However, to avoid run-time feature detection, we don't use these byte-based
 // instructions for most of the copying, preferring the qword variants.
 
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
+#[inline(always)]
+pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
     let qword_count = count >> 3;
     let byte_count = count & 0b111;
     asm!(
@@ -30,18 +28,10 @@ pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) ->
         inout("rsi") src => _,
         options(nostack, preserves_flags)
     );
-    dest
 }
 
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
-    let delta = (dest as usize).wrapping_sub(src as usize);
-    if delta >= count {
-        // We can copy forwards because either dest is far enough ahead of src,
-        // or src is ahead of dest (and delta overflowed).
-        return self::memcpy(dest, src, count);
-    }
-    // copy backwards
+#[inline(always)]
+pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
     let qword_count = count >> 3;
     let byte_count = count & 0b111;
     asm!(
@@ -58,11 +48,10 @@ pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) ->
         inout("rsi") src.offset(count as isize).wrapping_sub(8) => _,
         options(nostack)
     );
-    dest
 }
 
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
+#[inline(always)]
+pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
     let qword_count = count >> 3;
     let byte_count = count & 0b111;
     asm!(
@@ -72,8 +61,7 @@ pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u
         byte_count = in(reg) byte_count,
         inout("rcx") qword_count => _,
         inout("rdi") dest => _,
-        in("rax") (c as u8 as u64) * 0x0101010101010101,
+        in("rax") (c as u64) * 0x0101010101010101,
         options(nostack, preserves_flags)
    );
-    dest
 }

From 69d259b2f6b2d56dc1cfde6d19a237479ae8c835 Mon Sep 17 00:00:00 2001
From: Joe Richey
Date: Fri, 23 Oct 2020 04:40:38 -0700
Subject: [PATCH 2/3] Use ERMSB implementations if the feature is set

Signed-off-by: Joe Richey
---
 src/mem/x86_64.rs | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
index b94a09da..7eefd809 100644
--- a/src/mem/x86_64.rs
+++ b/src/mem/x86_64.rs
@@ -11,10 +11,25 @@
 // - FSRM - Fast Short REP MOV (Ice Lake and later)
 // - Fast Zero-Length MOVSB (On no current hardware)
 // - Fast Short STOSB (On no current hardware)
-// However, to avoid run-time feature detection, we don't use these byte-based
-// instructions for most of the copying, preferring the qword variants.
+//
+// To simplify things, we switch to using the byte-based variants if the "ermsb"
+// feature is present at compile-time. We don't bother detecting other features.
+// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
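+//
+// For example, building with something like RUSTFLAGS="-C target-feature=+ermsb"
+// (assuming the target and toolchain expose the "ermsb" feature) selects the
+// byte-based paths below; otherwise the qword implementations are used.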
 
 #[inline(always)]
+#[cfg(target_feature = "ermsb")]
+pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
+    asm!(
+        "rep movsb [rdi], [rsi]",
+        inout("rcx") count => _,
+        inout("rdi") dest => _,
+        inout("rsi") src => _,
+        options(nostack, preserves_flags)
+    );
+}
+
+#[inline(always)]
+#[cfg(not(target_feature = "ermsb"))]
 pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
     let qword_count = count >> 3;
     let byte_count = count & 0b111;
@@ -51,6 +66,19 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
 }
 
 #[inline(always)]
+#[cfg(target_feature = "ermsb")]
+pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
+    asm!(
+        "rep stosb [rdi], al",
+        inout("rcx") count => _,
+        inout("rdi") dest => _,
+        inout("al") c => _,
+        options(nostack, preserves_flags)
+    )
+}
+
+#[inline(always)]
+#[cfg(not(target_feature = "ermsb"))]
 pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
     let qword_count = count >> 3;
     let byte_count = count & 0b111;

From 10c27e8353deb0d97cfda13301015d8d32cc08c8 Mon Sep 17 00:00:00 2001
From: Joe Richey
Date: Tue, 3 Nov 2020 01:20:13 -0800
Subject: [PATCH 3/3] Add non-aligned benchmarks

Signed-off-by: Joe Richey
---
 testcrate/benches/mem.rs | 80 ++++++++++++++++++++++++++++------------
 1 file changed, 56 insertions(+), 24 deletions(-)

diff --git a/testcrate/benches/mem.rs b/testcrate/benches/mem.rs
index 57d57508..cee64ae4 100644
--- a/testcrate/benches/mem.rs
+++ b/testcrate/benches/mem.rs
@@ -6,33 +6,33 @@ use test::{black_box, Bencher};
 extern crate compiler_builtins;
 use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
 
-fn memcpy_builtin(b: &mut Bencher, n: usize) {
-    let v1 = vec![1u8; n];
-    let mut v2 = vec![0u8; n];
+fn memcpy_builtin(b: &mut Bencher, n: usize, offset: usize) {
+    let v1 = vec![1u8; n + offset];
+    let mut v2 = vec![0u8; n + offset];
     b.bytes = n as u64;
     b.iter(|| {
-        let src: &[u8] = black_box(&v1);
-        let dst: &mut [u8] = black_box(&mut v2);
+        let src: &[u8] = black_box(&v1[offset..]);
+        let dst: &mut [u8] = black_box(&mut v2[offset..]);
         dst.copy_from_slice(src);
     })
 }
 
-fn memcpy_rust(b: &mut Bencher, n: usize) {
-    let v1 = vec![1u8; n];
-    let mut v2 = vec![0u8; n];
+fn memcpy_rust(b: &mut Bencher, n: usize, offset: usize) {
+    let v1 = vec![1u8; n + offset];
+    let mut v2 = vec![0u8; n + offset];
     b.bytes = n as u64;
     b.iter(|| {
-        let src: &[u8] = black_box(&v1);
-        let dst: &mut [u8] = black_box(&mut v2);
+        let src: &[u8] = black_box(&v1[offset..]);
+        let dst: &mut [u8] = black_box(&mut v2[offset..]);
         unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
     })
 }
 
-fn memset_builtin(b: &mut Bencher, n: usize) {
-    let mut v1 = vec![0u8; n];
+fn memset_builtin(b: &mut Bencher, n: usize, offset: usize) {
+    let mut v1 = vec![0u8; n + offset];
     b.bytes = n as u64;
     b.iter(|| {
-        let dst: &mut [u8] = black_box(&mut v1);
+        let dst: &mut [u8] = black_box(&mut v1[offset..]);
         let val: u8 = black_box(27);
         for b in dst {
             *b = val;
@@ -40,11 +40,11 @@ fn memset_builtin(b: &mut Bencher, n: usize) {
     })
 }
-fn memset_rust(b: &mut Bencher, n: usize) {
-    let mut v1 = vec![0u8; n];
+fn memset_rust(b: &mut Bencher, n: usize, offset: usize) {
+    let mut v1 = vec![0u8; n + offset];
     b.bytes = n as u64;
     b.iter(|| {
-        let dst: &mut [u8] = black_box(&mut v1);
+        let dst: &mut [u8] = black_box(&mut v1[offset..]);
         let val = black_box(27);
         unsafe { memset(dst.as_mut_ptr(), val, n) }
     })
 }
@@ -95,36 +95,68 @@ fn memmove_rust(b: &mut Bencher, n: usize) {
 #[bench]
 fn memcpy_builtin_4096(b: &mut Bencher) {
-    memcpy_builtin(b, 4096)
+    memcpy_builtin(b, 4096, 0)
 }
 #[bench]
 fn memcpy_rust_4096(b: &mut Bencher) {
-    memcpy_rust(b, 4096)
+    memcpy_rust(b, 4096, 0)
 }
 #[bench]
 fn memcpy_builtin_1048576(b: &mut Bencher) {
-    memcpy_builtin(b, 1048576)
+    memcpy_builtin(b, 1048576, 0)
 }
 #[bench]
 fn memcpy_rust_1048576(b: &mut Bencher) {
-    memcpy_rust(b, 1048576)
+    memcpy_rust(b, 1048576, 0)
+}
+#[bench]
+fn memcpy_builtin_4096_offset(b: &mut Bencher) {
+    memcpy_builtin(b, 4096, 65)
+}
+#[bench]
+fn memcpy_rust_4096_offset(b: &mut Bencher) {
+    memcpy_rust(b, 4096, 65)
+}
+#[bench]
+fn memcpy_builtin_1048576_offset(b: &mut Bencher) {
+    memcpy_builtin(b, 1048576, 65)
+}
+#[bench]
+fn memcpy_rust_1048576_offset(b: &mut Bencher) {
+    memcpy_rust(b, 1048576, 65)
 }
 #[bench]
 fn memset_builtin_4096(b: &mut Bencher) {
-    memset_builtin(b, 4096)
+    memset_builtin(b, 4096, 0)
 }
 #[bench]
 fn memset_rust_4096(b: &mut Bencher) {
-    memset_rust(b, 4096)
+    memset_rust(b, 4096, 0)
 }
 #[bench]
 fn memset_builtin_1048576(b: &mut Bencher) {
-    memset_builtin(b, 1048576)
+    memset_builtin(b, 1048576, 0)
 }
 #[bench]
 fn memset_rust_1048576(b: &mut Bencher) {
-    memset_rust(b, 1048576)
+    memset_rust(b, 1048576, 0)
+}
+#[bench]
+fn memset_builtin_4096_offset(b: &mut Bencher) {
+    memset_builtin(b, 4096, 65)
+}
+#[bench]
+fn memset_rust_4096_offset(b: &mut Bencher) {
+    memset_rust(b, 4096, 65)
+}
+#[bench]
+fn memset_builtin_1048576_offset(b: &mut Bencher) {
+    memset_builtin(b, 1048576, 65)
+}
+#[bench]
+fn memset_rust_1048576_offset(b: &mut Bencher) {
+    memset_rust(b, 1048576, 65)
 }
 #[bench]