Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use REP MOVSB/STOSB when the ERMSB feature is present #392

Merged
merged 3 commits into from
Nov 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions src/mem/impls.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
use super::c_int;

/// Byte-by-byte forward copy of `n` bytes from `src` to `dest`.
///
/// # Safety
///
/// `src` must be valid for reads of `n` bytes and `dest` valid for writes of
/// `n` bytes. Overlapping regions are only safe when `dest` is at or below
/// `src` (a forward copy clobbers later source bytes otherwise).
#[inline(always)]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, n: usize) {
    let mut i = 0;
    while i < n {
        // `add(i)` is the idiomatic form of `offset(i as isize)` for
        // unsigned element offsets (clippy: ptr_offset_with_cast).
        *dest.add(i) = *src.add(i);
        i += 1;
    }
}

/// Byte-by-byte copy of `n` bytes from `src` to `dest`, iterating from the
/// last byte down to the first.
///
/// # Safety
///
/// `src` must be valid for reads of `n` bytes and `dest` valid for writes of
/// `n` bytes. Copying from the end makes this safe for overlapping regions
/// where `dest` lies above `src`.
#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, n: usize) {
    // copy from end
    let mut i = n;
    while i != 0 {
        i -= 1;
        // `add(i)` is the idiomatic form of `offset(i as isize)` for
        // unsigned element offsets (clippy: ptr_offset_with_cast).
        *dest.add(i) = *src.add(i);
    }
}

/// Fills `n` bytes starting at `s` with the byte value `c`.
///
/// # Safety
///
/// `s` must be valid for writes of `n` bytes.
#[inline(always)]
pub unsafe fn set_bytes(s: *mut u8, c: u8, n: usize) {
    let mut i = 0;
    while i < n {
        // `add(i)` is the idiomatic form of `offset(i as isize)` for
        // unsigned element offsets (clippy: ptr_offset_with_cast).
        *s.add(i) = c;
        i += 1;
    }
}
41 changes: 0 additions & 41 deletions src/mem/memcpy.rs

This file was deleted.

28 changes: 26 additions & 2 deletions src/mem/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,32 @@ use core::ops::{BitOr, Shl};

// memcpy/memmove/memset have optimized implementations on some architectures
#[cfg_attr(all(feature = "asm", target_arch = "x86_64"), path = "x86_64.rs")]
mod memcpy;
pub use self::memcpy::*;
mod impls;

/// C `memcpy`: copies `n` bytes from `src` to `dest` via the architecture's
/// `copy_forward` implementation and returns `dest`.
///
/// # Safety
///
/// As for C `memcpy`: `src` must be readable and `dest` writable for `n`
/// bytes, and the regions must not overlap (use `memmove` for overlap).
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    impls::copy_forward(dest, src, n);
    dest
}

/// C `memmove`: copies `n` bytes from `src` to `dest`, correct even when the
/// regions overlap; returns `dest`.
///
/// # Safety
///
/// `src` must be valid for reads and `dest` valid for writes of `n` bytes.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    // Wrapping distance from src up to dest. A forward copy is only unsafe
    // when dest starts strictly inside the source range, i.e. when
    // 0 < delta < n. If src >= dest the subtraction wraps to a huge value
    // (>= n), so the forward branch is correctly taken.
    let delta = (dest as usize).wrapping_sub(src as usize);
    if delta >= n {
        // We can copy forwards because either dest is far enough ahead of src,
        // or src is ahead of dest (and delta overflowed).
        impls::copy_forward(dest, src, n);
    } else {
        impls::copy_backward(dest, src, n);
    }
    dest
}

/// C `memset`: fills `n` bytes at `s` with the low byte of `c`; returns `s`.
///
/// # Safety
///
/// `s` must be valid for writes of `n` bytes.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
    // C passes the fill value as an int; only the low byte is used.
    impls::set_bytes(s, c as u8, n);
    s
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
Expand Down
58 changes: 37 additions & 21 deletions src/mem/x86_64.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use super::c_int;

// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
Expand All @@ -13,11 +11,26 @@ use super::c_int;
// - FSRM - Fast Short REP MOV (Ice Lake and later)
// - Fast Zero-Length MOVSB (On no current hardware)
// - Fast Short STOSB (On no current hardware)
// However, to avoid run-time feature detection, we don't use these byte-based
// instructions for most of the copying, preferring the qword variants.
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".

/// ERMSB build: forward copy using a single byte-granular `rep movsb`,
/// which the comment block above says is preferred when the "ermsb"
/// target feature is enabled at compile time.
///
/// # Safety
///
/// `src` must be readable and `dest` writable for `count` bytes; overlapping
/// regions are only safe when `dest` is at or below `src`.
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    // rep movsb copies rcx bytes from [rsi] to [rdi], advancing both
    // pointers upward (assumes the direction flag is clear on entry,
    // as the platform ABI requires — NOTE(review): confirm for all targets).
    asm!(
        "rep movsb [rdi], [rsi]",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(nostack, preserves_flags)
    );
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
Expand All @@ -30,18 +43,10 @@ pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) ->
inout("rsi") src => _,
options(nostack, preserves_flags)
);
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
let delta = (dest as usize).wrapping_sub(src as usize);
if delta >= count {
// We can copy forwards because either dest is far enough ahead of src,
// or src is ahead of dest (and delta overflowed).
return self::memcpy(dest, src, count);
}
// copy backwards
#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
Expand All @@ -58,11 +63,23 @@ pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) ->
inout("rsi") src.offset(count as isize).wrapping_sub(8) => _,
options(nostack)
);
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
/// ERMSB build: fill `count` bytes at `dest` with `c` using a single
/// byte-granular `rep stosb`.
///
/// # Safety
///
/// `dest` must be valid for writes of `count` bytes.
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
    // rep stosb stores al into [rdi] rcx times, advancing rdi upward
    // (assumes the direction flag is clear on entry, per the ABI —
    // NOTE(review): confirm for all targets).
    asm!(
        "rep stosb [rdi], al",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("al") c => _,
        options(nostack, preserves_flags)
    )
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
Expand All @@ -72,8 +89,7 @@ pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u
byte_count = in(reg) byte_count,
inout("rcx") qword_count => _,
inout("rdi") dest => _,
in("rax") (c as u8 as u64) * 0x0101010101010101,
in("rax") (c as u64) * 0x0101010101010101,
options(nostack, preserves_flags)
);
dest
}
80 changes: 56 additions & 24 deletions testcrate/benches/mem.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,45 +6,45 @@ use test::{black_box, Bencher};
extern crate compiler_builtins;
use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};

fn memcpy_builtin(b: &mut Bencher, n: usize) {
let v1 = vec![1u8; n];
let mut v2 = vec![0u8; n];
fn memcpy_builtin(b: &mut Bencher, n: usize, offset: usize) {
let v1 = vec![1u8; n + offset];
let mut v2 = vec![0u8; n + offset];
b.bytes = n as u64;
b.iter(|| {
let src: &[u8] = black_box(&v1);
let dst: &mut [u8] = black_box(&mut v2);
let src: &[u8] = black_box(&v1[offset..]);
let dst: &mut [u8] = black_box(&mut v2[offset..]);
dst.copy_from_slice(src);
})
}

fn memcpy_rust(b: &mut Bencher, n: usize) {
let v1 = vec![1u8; n];
let mut v2 = vec![0u8; n];
fn memcpy_rust(b: &mut Bencher, n: usize, offset: usize) {
let v1 = vec![1u8; n + offset];
let mut v2 = vec![0u8; n + offset];
b.bytes = n as u64;
b.iter(|| {
let src: &[u8] = black_box(&v1);
let dst: &mut [u8] = black_box(&mut v2);
let src: &[u8] = black_box(&v1[offset..]);
let dst: &mut [u8] = black_box(&mut v2[offset..]);
unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
})
}

fn memset_builtin(b: &mut Bencher, n: usize) {
let mut v1 = vec![0u8; n];
fn memset_builtin(b: &mut Bencher, n: usize, offset: usize) {
let mut v1 = vec![0u8; n + offset];
b.bytes = n as u64;
b.iter(|| {
let dst: &mut [u8] = black_box(&mut v1);
let dst: &mut [u8] = black_box(&mut v1[offset..]);
let val: u8 = black_box(27);
for b in dst {
*b = val;
}
})
}

fn memset_rust(b: &mut Bencher, n: usize) {
let mut v1 = vec![0u8; n];
fn memset_rust(b: &mut Bencher, n: usize, offset: usize) {
let mut v1 = vec![0u8; n + offset];
b.bytes = n as u64;
b.iter(|| {
let dst: &mut [u8] = black_box(&mut v1);
let dst: &mut [u8] = black_box(&mut v1[offset..]);
let val = black_box(27);
unsafe { memset(dst.as_mut_ptr(), val, n) }
})
Expand Down Expand Up @@ -95,36 +95,68 @@ fn memmove_rust(b: &mut Bencher, n: usize) {

#[bench]
fn memcpy_builtin_4096(b: &mut Bencher) {
memcpy_builtin(b, 4096)
memcpy_builtin(b, 4096, 0)
}
#[bench]
fn memcpy_rust_4096(b: &mut Bencher) {
memcpy_rust(b, 4096)
memcpy_rust(b, 4096, 0)
}
#[bench]
fn memcpy_builtin_1048576(b: &mut Bencher) {
memcpy_builtin(b, 1048576)
memcpy_builtin(b, 1048576, 0)
}
#[bench]
fn memcpy_rust_1048576(b: &mut Bencher) {
memcpy_rust(b, 1048576)
memcpy_rust(b, 1048576, 0)
}
#[bench]
fn memcpy_builtin_4096_offset(b: &mut Bencher) {
    // 65-byte offset exercises misaligned buffer starts, 4 KiB.
    memcpy_builtin(b, 4096, 65)
}
#[bench]
fn memcpy_rust_4096_offset(b: &mut Bencher) {
    // 65-byte offset exercises misaligned buffer starts, 4 KiB.
    memcpy_rust(b, 4096, 65)
}
#[bench]
fn memcpy_builtin_1048576_offset(b: &mut Bencher) {
    // 65-byte offset exercises misaligned buffer starts, 1 MiB.
    memcpy_builtin(b, 1048576, 65)
}
#[bench]
fn memcpy_rust_1048576_offset(b: &mut Bencher) {
    // 65-byte offset exercises misaligned buffer starts, 1 MiB.
    memcpy_rust(b, 1048576, 65)
}

#[bench]
fn memset_builtin_4096(b: &mut Bencher) {
memset_builtin(b, 4096)
memset_builtin(b, 4096, 0)
}
#[bench]
fn memset_rust_4096(b: &mut Bencher) {
memset_rust(b, 4096)
memset_rust(b, 4096, 0)
}
#[bench]
fn memset_builtin_1048576(b: &mut Bencher) {
memset_builtin(b, 1048576)
memset_builtin(b, 1048576, 0)
}
#[bench]
fn memset_rust_1048576(b: &mut Bencher) {
memset_rust(b, 1048576)
memset_rust(b, 1048576, 0)
}
#[bench]
fn memset_builtin_4096_offset(b: &mut Bencher) {
    // 65-byte offset exercises misaligned buffer starts, 4 KiB.
    memset_builtin(b, 4096, 65)
}
#[bench]
fn memset_rust_4096_offset(b: &mut Bencher) {
    // 65-byte offset exercises misaligned buffer starts, 4 KiB.
    memset_rust(b, 4096, 65)
}
#[bench]
fn memset_builtin_1048576_offset(b: &mut Bencher) {
    // 65-byte offset exercises misaligned buffer starts, 1 MiB.
    memset_builtin(b, 1048576, 65)
}
#[bench]
fn memset_rust_1048576_offset(b: &mut Bencher) {
    // 65-byte offset exercises misaligned buffer starts, 1 MiB.
    memset_rust(b, 1048576, 65)
}

#[bench]
Expand Down