diff --git a/src/lib.rs b/src/lib.rs index e0b4ce3..16f9aa8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -179,3 +179,6 @@ mod memchr; pub mod memmem; #[cfg(test)] mod tests; +// SIMD is only supported on x86_64 currently. +#[cfg(target_arch = "x86_64")] +mod vector; diff --git a/src/memchr/x86/sse2.rs b/src/memchr/genericsimd.rs similarity index 52% rename from src/memchr/x86/sse2.rs rename to src/memchr/genericsimd.rs index b7b3a93..27b9ca3 100644 --- a/src/memchr/x86/sse2.rs +++ b/src/memchr/genericsimd.rs @@ -1,19 +1,20 @@ -use core::{arch::x86_64::*, cmp, mem::size_of}; +use crate::vector::Vector; +use core::cmp; -const VECTOR_SIZE: usize = size_of::<__m128i>(); -const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; +// The number of elements to loop at in one iteration of memchr/memrchr. +const LOOP_AMT: usize = 4; -// The number of bytes to loop at in one iteration of memchr/memrchr. -const LOOP_SIZE: usize = 4 * VECTOR_SIZE; - -// The number of bytes to loop at in one iteration of memchr2/memrchr2 and +// The number of elements to loop at in one iteration of memchr2/memrchr2 and // memchr3/memrchr3. There was no observable difference between 64 and 32 bytes // in benchmarks. memchr3 in particular only gets a very slight speed up from // the loop unrolling. -const LOOP_SIZE2: usize = 2 * VECTOR_SIZE; +const LOOP_AMT2: usize = 2; -#[target_feature(enable = "sse2")] -pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option { +#[inline(always)] +pub(crate) unsafe fn memchr( + n1: u8, + haystack: &[u8], +) -> Option { // What follows is a fast SSE2-only algorithm to detect the position of // `n1` in `haystack` if it exists. From what I know, this is the "classic" // algorithm. I believe it can be found in places like glibc and Go's @@ -105,14 +106,14 @@ pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option { // structure to what you see below, so this comment applies fairly well to // all of them. 
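// NOTE: the new `src/vector.rs` module is not shown in this diff, so the
// trait below is only a minimal sketch inferred from the call sites in
// genericsimd.rs (splat, size, align_mask, load_aligned, load_unaligned,
// cmpeq, or, movemask). The exact names, signatures and the SSE2 impl are
// assumptions for illustration, not the definitions the patch actually adds.
pub(crate) trait Vector: Copy {
    /// Number of bytes in one vector (16 for SSE2, 32 for AVX2).
    fn size() -> usize;
    /// `size() - 1`, used to round pointers to a vector boundary.
    fn align_mask() -> usize {
        Self::size() - 1
    }
    /// Broadcast one byte into every lane.
    unsafe fn splat(b: u8) -> Self;
    /// Load a vector from a pointer aligned to `size()` bytes.
    unsafe fn load_aligned(p: *const u8) -> Self;
    /// Load a vector from a possibly unaligned pointer.
    unsafe fn load_unaligned(p: *const u8) -> Self;
    /// Lane-wise byte equality: 0xFF for equal lanes, 0x00 otherwise.
    unsafe fn cmpeq(self, other: Self) -> Self;
    /// Lane-wise bitwise OR.
    unsafe fn or(self, other: Self) -> Self;
    /// Pack the high bit of each byte lane into the low bits of a u32.
    unsafe fn movemask(self) -> u32;
}

// A matching SSE2 implementation would look roughly like this (again, a
// sketch: the patch's real impls live in the unshown src/vector.rs).
#[cfg(target_arch = "x86_64")]
impl Vector for core::arch::x86_64::__m128i {
    fn size() -> usize {
        16
    }
    unsafe fn splat(b: u8) -> Self {
        core::arch::x86_64::_mm_set1_epi8(b as i8)
    }
    unsafe fn load_aligned(p: *const u8) -> Self {
        core::arch::x86_64::_mm_load_si128(p as *const Self)
    }
    unsafe fn load_unaligned(p: *const u8) -> Self {
        core::arch::x86_64::_mm_loadu_si128(p as *const Self)
    }
    unsafe fn cmpeq(self, other: Self) -> Self {
        core::arch::x86_64::_mm_cmpeq_epi8(self, other)
    }
    unsafe fn or(self, other: Self) -> Self {
        core::arch::x86_64::_mm_or_si128(self, other)
    }
    unsafe fn movemask(self) -> u32 {
        core::arch::x86_64::_mm_movemask_epi8(self) as u32
    }
}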
- let vn1 = _mm_set1_epi8(n1 as i8); + let vn1 = V::splat(n1); let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE, len); + let loop_size = cmp::min(V::size() * LOOP_AMT, len); let start_ptr = haystack.as_ptr(); let end_ptr = start_ptr.add(haystack.len()); let mut ptr = start_ptr; - if haystack.len() < VECTOR_SIZE { + if haystack.len() < V::size() { while ptr < end_ptr { if *ptr == n1 { return Some(sub(ptr, start_ptr)); @@ -126,77 +127,81 @@ pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option { return Some(i); } - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); - while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); - let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); - let eqa = _mm_cmpeq_epi8(vn1, a); - let eqb = _mm_cmpeq_epi8(vn1, b); - let eqc = _mm_cmpeq_epi8(vn1, c); - let eqd = _mm_cmpeq_epi8(vn1, d); - let or1 = _mm_or_si128(eqa, eqb); - let or2 = _mm_or_si128(eqc, eqd); - let or3 = _mm_or_si128(or1, or2); - if _mm_movemask_epi8(or3) != 0 { + ptr = ptr.add(V::size() - (start_ptr as usize & V::align_mask())); + debug_assert!(ptr > start_ptr && end_ptr.sub(V::size()) >= start_ptr); + while loop_size == V::size() * LOOP_AMT && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % V::size()); + + let a = V::load_aligned(ptr); + let b = V::load_aligned(ptr.add(V::size())); + let c = V::load_aligned(ptr.add(2 * V::size())); + let d = V::load_aligned(ptr.add(3 * V::size())); + let eqa = vn1.cmpeq(a); + let eqb = vn1.cmpeq(b); + let eqc = vn1.cmpeq(c); + let eqd = vn1.cmpeq(d); + let or1 = eqa.or(eqb); + let or2 = eqc.or(eqd); + let or3 = or1.or(or2); + if or3.movemask() != 0 { let mut at = sub(ptr, start_ptr); - let mask = _mm_movemask_epi8(eqa); + let mask = eqa.movemask(); if mask != 0 { return Some(at + forward_pos(mask)); } - at += VECTOR_SIZE; - let mask = _mm_movemask_epi8(eqb); + at += V::size(); + let mask = eqb.movemask(); if mask != 0 { return Some(at + forward_pos(mask)); } - at += VECTOR_SIZE; - let mask = _mm_movemask_epi8(eqc); + at += V::size(); + let mask = eqc.movemask(); if mask != 0 { return Some(at + forward_pos(mask)); } - at += VECTOR_SIZE; - let mask = _mm_movemask_epi8(eqd); + at += V::size(); + let mask = eqd.movemask(); debug_assert!(mask != 0); return Some(at + forward_pos(mask)); } ptr = ptr.add(loop_size); } - while ptr <= end_ptr.sub(VECTOR_SIZE) { - debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); + while ptr <= end_ptr.sub(V::size()) { + debug_assert!(sub(end_ptr, ptr) >= V::size()); if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { return Some(i); } - ptr = ptr.add(VECTOR_SIZE); + ptr = ptr.add(V::size()); } if ptr < end_ptr { - debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); - ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); - debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + debug_assert!(sub(end_ptr, ptr) < V::size()); + ptr = ptr.sub(V::size() - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), V::size()); return forward_search1(start_ptr, end_ptr, ptr, vn1); } None } -#[target_feature(enable = "sse2")] -pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { - let vn1 = _mm_set1_epi8(n1 as i8); - let vn2 = _mm_set1_epi8(n2 as i8); 
+#[inline(always)] +pub(crate) unsafe fn memchr2( + n1: u8, + n2: u8, + haystack: &[u8], +) -> Option { + let vn1 = V::splat(n1); + let vn2 = V::splat(n2); let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); + let loop_size = cmp::min(LOOP_AMT2 * V::size(), len); let start_ptr = haystack.as_ptr(); let end_ptr = start_ptr.add(haystack.len()); let mut ptr = start_ptr; - if haystack.len() < VECTOR_SIZE { + if haystack.len() < V::size() { while ptr < end_ptr { if *ptr == n1 || *ptr == n2 { return Some(sub(ptr, start_ptr)); @@ -210,68 +215,68 @@ pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { return Some(i); } - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); - while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let eqa1 = _mm_cmpeq_epi8(vn1, a); - let eqb1 = _mm_cmpeq_epi8(vn1, b); - let eqa2 = _mm_cmpeq_epi8(vn2, a); - let eqb2 = _mm_cmpeq_epi8(vn2, b); - let or1 = _mm_or_si128(eqa1, eqb1); - let or2 = _mm_or_si128(eqa2, eqb2); - let or3 = _mm_or_si128(or1, or2); - if _mm_movemask_epi8(or3) != 0 { + ptr = ptr.add(V::size() - (start_ptr as usize & V::align_mask())); + debug_assert!(ptr > start_ptr && end_ptr.sub(V::size()) >= start_ptr); + while loop_size == LOOP_AMT2 * V::size() && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % V::size()); + + let a = V::load_aligned(ptr); + let b = V::load_aligned(ptr.add(V::size())); + let eqa1 = vn1.cmpeq(a); + let eqb1 = vn1.cmpeq(b); + let eqa2 = vn2.cmpeq(a); + let eqb2 = vn2.cmpeq(b); + let or1 = eqa1.or(eqb1); + let or2 = eqa2.or(eqb2); + let or3 = or1.or(or2); + if or3.movemask() != 0 { let mut at = sub(ptr, start_ptr); - let mask1 = _mm_movemask_epi8(eqa1); - let mask2 = _mm_movemask_epi8(eqa2); + let mask1 = eqa1.movemask(); + let mask2 = eqa2.movemask(); if mask1 != 0 || mask2 != 0 { return Some(at + forward_pos2(mask1, mask2)); } - at += VECTOR_SIZE; - let mask1 = _mm_movemask_epi8(eqb1); - let mask2 = _mm_movemask_epi8(eqb2); + at += V::size(); + let mask1 = eqb1.movemask(); + let mask2 = eqb2.movemask(); return Some(at + forward_pos2(mask1, mask2)); } ptr = ptr.add(loop_size); } - while ptr <= end_ptr.sub(VECTOR_SIZE) { + while ptr <= end_ptr.sub(V::size()) { if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { return Some(i); } - ptr = ptr.add(VECTOR_SIZE); + ptr = ptr.add(V::size()); } if ptr < end_ptr { - debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); - ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); - debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + debug_assert!(sub(end_ptr, ptr) < V::size()); + ptr = ptr.sub(V::size() - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), V::size()); return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2); } None } -#[target_feature(enable = "sse2")] -pub unsafe fn memchr3( +#[inline(always)] +pub(crate) unsafe fn memchr3( n1: u8, n2: u8, n3: u8, haystack: &[u8], ) -> Option { - let vn1 = _mm_set1_epi8(n1 as i8); - let vn2 = _mm_set1_epi8(n2 as i8); - let vn3 = _mm_set1_epi8(n3 as i8); + let vn1 = V::splat(n1); + let vn2 = V::splat(n2); + let vn3 = V::splat(n3); let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); + let loop_size = cmp::min(LOOP_AMT2 * V::size(), len); let start_ptr = haystack.as_ptr(); let end_ptr = 
start_ptr.add(haystack.len()); let mut ptr = start_ptr; - if haystack.len() < VECTOR_SIZE { + if haystack.len() < V::size() { while ptr < end_ptr { if *ptr == n1 || *ptr == n2 || *ptr == n3 { return Some(sub(ptr, start_ptr)); @@ -285,69 +290,72 @@ pub unsafe fn memchr3( return Some(i); } - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); - while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let eqa1 = _mm_cmpeq_epi8(vn1, a); - let eqb1 = _mm_cmpeq_epi8(vn1, b); - let eqa2 = _mm_cmpeq_epi8(vn2, a); - let eqb2 = _mm_cmpeq_epi8(vn2, b); - let eqa3 = _mm_cmpeq_epi8(vn3, a); - let eqb3 = _mm_cmpeq_epi8(vn3, b); - let or1 = _mm_or_si128(eqa1, eqb1); - let or2 = _mm_or_si128(eqa2, eqb2); - let or3 = _mm_or_si128(eqa3, eqb3); - let or4 = _mm_or_si128(or1, or2); - let or5 = _mm_or_si128(or3, or4); - if _mm_movemask_epi8(or5) != 0 { + ptr = ptr.add(V::size() - (start_ptr as usize & V::align_mask())); + debug_assert!(ptr > start_ptr && end_ptr.sub(V::size()) >= start_ptr); + while loop_size == LOOP_AMT2 * V::size() && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % V::size()); + + let a = V::load_aligned(ptr); + let b = V::load_aligned(ptr.add(V::size())); + let eqa1 = vn1.cmpeq(a); + let eqb1 = vn1.cmpeq(b); + let eqa2 = vn2.cmpeq(a); + let eqb2 = vn2.cmpeq(b); + let eqa3 = vn3.cmpeq(a); + let eqb3 = vn3.cmpeq(b); + let or1 = eqa1.or(eqb1); + let or2 = eqa2.or(eqb2); + let or3 = eqa3.or(eqb3); + let or4 = or1.or(or2); + let or5 = or3.or(or4); + if or5.movemask() != 0 { let mut at = sub(ptr, start_ptr); - let mask1 = _mm_movemask_epi8(eqa1); - let mask2 = _mm_movemask_epi8(eqa2); - let mask3 = _mm_movemask_epi8(eqa3); + let mask1 = eqa1.movemask(); + let mask2 = eqa2.movemask(); + let mask3 = eqa3.movemask(); if mask1 != 0 || mask2 != 0 || mask3 != 0 { return Some(at + forward_pos3(mask1, mask2, mask3)); } - at += VECTOR_SIZE; - let mask1 = _mm_movemask_epi8(eqb1); - let mask2 = _mm_movemask_epi8(eqb2); - let mask3 = _mm_movemask_epi8(eqb3); + at += V::size(); + let mask1 = eqb1.movemask(); + let mask2 = eqb2.movemask(); + let mask3 = eqb3.movemask(); return Some(at + forward_pos3(mask1, mask2, mask3)); } ptr = ptr.add(loop_size); } - while ptr <= end_ptr.sub(VECTOR_SIZE) { + while ptr <= end_ptr.sub(V::size()) { if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { return Some(i); } - ptr = ptr.add(VECTOR_SIZE); + ptr = ptr.add(V::size()); } if ptr < end_ptr { - debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); - ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); - debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + debug_assert!(sub(end_ptr, ptr) < V::size()); + ptr = ptr.sub(V::size() - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), V::size()); return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); } None } -#[target_feature(enable = "sse2")] -pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option { - let vn1 = _mm_set1_epi8(n1 as i8); +#[inline(always)] +pub(crate) unsafe fn memrchr( + n1: u8, + haystack: &[u8], +) -> Option { + let vn1 = V::splat(n1); let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE, len); + let loop_size = cmp::min(LOOP_AMT * V::size(), len); let start_ptr = haystack.as_ptr(); let end_ptr = start_ptr.add(haystack.len()); let mut ptr 
= end_ptr; - if haystack.len() < VECTOR_SIZE { + if haystack.len() < V::size() { while ptr > start_ptr { ptr = ptr.offset(-1); if *ptr == n1 { @@ -357,77 +365,82 @@ pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option { return None; } - ptr = ptr.sub(VECTOR_SIZE); + ptr = ptr.sub(V::size()); if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { return Some(i); } - ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + ptr = (end_ptr as usize & !V::align_mask()) as *const u8; debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + while loop_size == LOOP_AMT * V::size() && ptr >= start_ptr.add(loop_size) + { + debug_assert_eq!(0, (ptr as usize) % V::size()); ptr = ptr.sub(loop_size); - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); - let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); - let eqa = _mm_cmpeq_epi8(vn1, a); - let eqb = _mm_cmpeq_epi8(vn1, b); - let eqc = _mm_cmpeq_epi8(vn1, c); - let eqd = _mm_cmpeq_epi8(vn1, d); - let or1 = _mm_or_si128(eqa, eqb); - let or2 = _mm_or_si128(eqc, eqd); - let or3 = _mm_or_si128(or1, or2); - if _mm_movemask_epi8(or3) != 0 { - let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr); - let mask = _mm_movemask_epi8(eqd); + let a = V::load_aligned(ptr); + let b = V::load_aligned(ptr.add(V::size())); + let c = V::load_aligned(ptr.add(2 * V::size())); + let d = V::load_aligned(ptr.add(3 * V::size())); + let eqa = vn1.cmpeq(a); + let eqb = vn1.cmpeq(b); + let eqc = vn1.cmpeq(c); + let eqd = vn1.cmpeq(d); + let or1 = eqa.or(eqb); + let or2 = eqc.or(eqd); + let or3 = or1.or(or2); + if or3.movemask() != 0 { + let mut at = sub(ptr.add(3 * V::size()), start_ptr); + let mask = eqd.movemask(); if mask != 0 { - return Some(at + reverse_pos(mask)); + return Some(at + reverse_pos::(mask)); } - at -= VECTOR_SIZE; - let mask = _mm_movemask_epi8(eqc); + at -= V::size(); + let mask = eqc.movemask(); if mask != 0 { - return Some(at + reverse_pos(mask)); + return Some(at + reverse_pos::(mask)); } - at -= VECTOR_SIZE; - let mask = _mm_movemask_epi8(eqb); + at -= V::size(); + let mask = eqb.movemask(); if mask != 0 { - return Some(at + reverse_pos(mask)); + return Some(at + reverse_pos::(mask)); } - at -= VECTOR_SIZE; - let mask = _mm_movemask_epi8(eqa); + at -= V::size(); + let mask = eqa.movemask(); debug_assert!(mask != 0); - return Some(at + reverse_pos(mask)); + return Some(at + reverse_pos::(mask)); } } - while ptr >= start_ptr.add(VECTOR_SIZE) { - ptr = ptr.sub(VECTOR_SIZE); + while ptr >= start_ptr.add(V::size()) { + ptr = ptr.sub(V::size()); if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { return Some(i); } } if ptr > start_ptr { - debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + debug_assert!(sub(ptr, start_ptr) < V::size()); return reverse_search1(start_ptr, end_ptr, start_ptr, vn1); } None } -#[target_feature(enable = "sse2")] -pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { - let vn1 = _mm_set1_epi8(n1 as i8); - let vn2 = _mm_set1_epi8(n2 as i8); +#[inline(always)] +pub(crate) unsafe fn memrchr2( + n1: u8, + n2: u8, + haystack: &[u8], +) -> Option { + let vn1 = V::splat(n1); + let vn2 = V::splat(n2); let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); + let loop_size = cmp::min(LOOP_AMT2 * V::size(), len); let start_ptr = 
haystack.as_ptr(); let end_ptr = start_ptr.add(haystack.len()); let mut ptr = end_ptr; - if haystack.len() < VECTOR_SIZE { + if haystack.len() < V::size() { while ptr > start_ptr { ptr = ptr.offset(-1); if *ptr == n1 || *ptr == n2 { @@ -437,70 +450,71 @@ pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { return None; } - ptr = ptr.sub(VECTOR_SIZE); + ptr = ptr.sub(V::size()); if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { return Some(i); } - ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + ptr = (end_ptr as usize & !V::align_mask()) as *const u8; debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + while loop_size == LOOP_AMT2 * V::size() && ptr >= start_ptr.add(loop_size) + { + debug_assert_eq!(0, (ptr as usize) % V::size()); ptr = ptr.sub(loop_size); - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let eqa1 = _mm_cmpeq_epi8(vn1, a); - let eqb1 = _mm_cmpeq_epi8(vn1, b); - let eqa2 = _mm_cmpeq_epi8(vn2, a); - let eqb2 = _mm_cmpeq_epi8(vn2, b); - let or1 = _mm_or_si128(eqa1, eqb1); - let or2 = _mm_or_si128(eqa2, eqb2); - let or3 = _mm_or_si128(or1, or2); - if _mm_movemask_epi8(or3) != 0 { - let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); - let mask1 = _mm_movemask_epi8(eqb1); - let mask2 = _mm_movemask_epi8(eqb2); + let a = V::load_aligned(ptr); + let b = V::load_aligned(ptr.add(V::size())); + let eqa1 = vn1.cmpeq(a); + let eqb1 = vn1.cmpeq(b); + let eqa2 = vn2.cmpeq(a); + let eqb2 = vn2.cmpeq(b); + let or1 = eqa1.or(eqb1); + let or2 = eqa2.or(eqb2); + let or3 = or1.or(or2); + if or3.movemask() != 0 { + let mut at = sub(ptr.add(V::size()), start_ptr); + let mask1 = eqb1.movemask(); + let mask2 = eqb2.movemask(); if mask1 != 0 || mask2 != 0 { - return Some(at + reverse_pos2(mask1, mask2)); + return Some(at + reverse_pos2::(mask1, mask2)); } - at -= VECTOR_SIZE; - let mask1 = _mm_movemask_epi8(eqa1); - let mask2 = _mm_movemask_epi8(eqa2); - return Some(at + reverse_pos2(mask1, mask2)); + at -= V::size(); + let mask1 = eqa1.movemask(); + let mask2 = eqa2.movemask(); + return Some(at + reverse_pos2::(mask1, mask2)); } } - while ptr >= start_ptr.add(VECTOR_SIZE) { - ptr = ptr.sub(VECTOR_SIZE); + while ptr >= start_ptr.add(V::size()) { + ptr = ptr.sub(V::size()); if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { return Some(i); } } if ptr > start_ptr { - debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + debug_assert!(sub(ptr, start_ptr) < V::size()); return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2); } None } -#[target_feature(enable = "sse2")] -pub unsafe fn memrchr3( +#[inline(always)] +pub(crate) unsafe fn memrchr3( n1: u8, n2: u8, n3: u8, haystack: &[u8], ) -> Option { - let vn1 = _mm_set1_epi8(n1 as i8); - let vn2 = _mm_set1_epi8(n2 as i8); - let vn3 = _mm_set1_epi8(n3 as i8); + let vn1 = V::splat(n1); + let vn2 = V::splat(n2); + let vn3 = V::splat(n3); let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); + let loop_size = cmp::min(LOOP_AMT2 * V::size(), len); let start_ptr = haystack.as_ptr(); let end_ptr = start_ptr.add(haystack.len()); let mut ptr = end_ptr; - if haystack.len() < VECTOR_SIZE { + if haystack.len() < V::size() { while ptr > start_ptr { ptr = ptr.offset(-1); if *ptr == n1 || *ptr == n2 || *ptr == n3 { @@ -510,48 +524,49 @@ pub unsafe fn memrchr3( return None; } - ptr = 
ptr.sub(VECTOR_SIZE); + ptr = ptr.sub(V::size()); if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { return Some(i); } - ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + ptr = (end_ptr as usize & !V::align_mask()) as *const u8; debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + while loop_size == LOOP_AMT2 * V::size() && ptr >= start_ptr.add(loop_size) + { + debug_assert_eq!(0, (ptr as usize) % V::size()); ptr = ptr.sub(loop_size); - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let eqa1 = _mm_cmpeq_epi8(vn1, a); - let eqb1 = _mm_cmpeq_epi8(vn1, b); - let eqa2 = _mm_cmpeq_epi8(vn2, a); - let eqb2 = _mm_cmpeq_epi8(vn2, b); - let eqa3 = _mm_cmpeq_epi8(vn3, a); - let eqb3 = _mm_cmpeq_epi8(vn3, b); - let or1 = _mm_or_si128(eqa1, eqb1); - let or2 = _mm_or_si128(eqa2, eqb2); - let or3 = _mm_or_si128(eqa3, eqb3); - let or4 = _mm_or_si128(or1, or2); - let or5 = _mm_or_si128(or3, or4); - if _mm_movemask_epi8(or5) != 0 { - let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); - let mask1 = _mm_movemask_epi8(eqb1); - let mask2 = _mm_movemask_epi8(eqb2); - let mask3 = _mm_movemask_epi8(eqb3); + let a = V::load_aligned(ptr); + let b = V::load_aligned(ptr.add(V::size())); + let eqa1 = vn1.cmpeq(a); + let eqb1 = vn1.cmpeq(b); + let eqa2 = vn2.cmpeq(a); + let eqb2 = vn2.cmpeq(b); + let eqa3 = vn3.cmpeq(a); + let eqb3 = vn3.cmpeq(b); + let or1 = eqa1.or(eqb1); + let or2 = eqa2.or(eqb2); + let or3 = eqa3.or(eqb3); + let or4 = or1.or(or2); + let or5 = or3.or(or4); + if or5.movemask() != 0 { + let mut at = sub(ptr.add(V::size()), start_ptr); + let mask1 = eqb1.movemask(); + let mask2 = eqb2.movemask(); + let mask3 = eqb3.movemask(); if mask1 != 0 || mask2 != 0 || mask3 != 0 { - return Some(at + reverse_pos3(mask1, mask2, mask3)); + return Some(at + reverse_pos3::(mask1, mask2, mask3)); } - at -= VECTOR_SIZE; - let mask1 = _mm_movemask_epi8(eqa1); - let mask2 = _mm_movemask_epi8(eqa2); - let mask3 = _mm_movemask_epi8(eqa3); - return Some(at + reverse_pos3(mask1, mask2, mask3)); + at -= V::size(); + let mask1 = eqa1.movemask(); + let mask2 = eqa2.movemask(); + let mask3 = eqa3.movemask(); + return Some(at + reverse_pos3::(mask1, mask2, mask3)); } } - while ptr >= start_ptr.add(VECTOR_SIZE) { - ptr = ptr.sub(VECTOR_SIZE); + while ptr >= start_ptr.add(V::size()) { + ptr = ptr.sub(V::size()); if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { @@ -559,25 +574,25 @@ pub unsafe fn memrchr3( } } if ptr > start_ptr { - debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + debug_assert!(sub(ptr, start_ptr) < V::size()); return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3); } None } -#[target_feature(enable = "sse2")] -pub unsafe fn forward_search1( +#[inline(always)] +pub(crate) unsafe fn forward_search1( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, - vn1: __m128i, + vn1: V, ) -> Option { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(sub(end_ptr, start_ptr) >= V::size()); debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + debug_assert!(ptr <= end_ptr.sub(V::size())); - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1)); + let chunk = V::load_unaligned(ptr); + let mask = chunk.cmpeq(vn1).movemask(); if mask != 0 { Some(sub(ptr, 
start_ptr) + forward_pos(mask)) } else { @@ -585,178 +600,182 @@ pub unsafe fn forward_search1( } } -#[target_feature(enable = "sse2")] -unsafe fn forward_search2( +#[inline(always)] +unsafe fn forward_search2( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, - vn1: __m128i, - vn2: __m128i, + vn1: V, + vn2: V, ) -> Option { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(sub(end_ptr, start_ptr) >= V::size()); debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let eq1 = _mm_cmpeq_epi8(chunk, vn1); - let eq2 = _mm_cmpeq_epi8(chunk, vn2); - if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 { - let mask1 = _mm_movemask_epi8(eq1); - let mask2 = _mm_movemask_epi8(eq2); + debug_assert!(ptr <= end_ptr.sub(V::size())); + + let chunk = V::load_unaligned(ptr); + let eq1 = chunk.cmpeq(vn1); + let eq2 = chunk.cmpeq(vn2); + if eq1.or(eq2).movemask() != 0 { + let mask1 = eq1.movemask(); + let mask2 = eq2.movemask(); Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2)) } else { None } } -#[target_feature(enable = "sse2")] -pub unsafe fn forward_search3( +#[inline(always)] +pub(crate) unsafe fn forward_search3( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, - vn1: __m128i, - vn2: __m128i, - vn3: __m128i, + vn1: V, + vn2: V, + vn3: V, ) -> Option { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(sub(end_ptr, start_ptr) >= V::size()); debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let eq1 = _mm_cmpeq_epi8(chunk, vn1); - let eq2 = _mm_cmpeq_epi8(chunk, vn2); - let eq3 = _mm_cmpeq_epi8(chunk, vn3); - let or = _mm_or_si128(eq1, eq2); - if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 { - let mask1 = _mm_movemask_epi8(eq1); - let mask2 = _mm_movemask_epi8(eq2); - let mask3 = _mm_movemask_epi8(eq3); + debug_assert!(ptr <= end_ptr.sub(V::size())); + + let chunk = V::load_unaligned(ptr); + let eq1 = chunk.cmpeq(vn1); + let eq2 = chunk.cmpeq(vn2); + let eq3 = chunk.cmpeq(vn3); + let or = eq1.or(eq2); + if or.or(eq3).movemask() != 0 { + let mask1 = eq1.movemask(); + let mask2 = eq2.movemask(); + let mask3 = eq3.movemask(); Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3)) } else { None } } -#[target_feature(enable = "sse2")] -unsafe fn reverse_search1( +#[inline(always)] +unsafe fn reverse_search1( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, - vn1: __m128i, + vn1: V, ) -> Option { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(sub(end_ptr, start_ptr) >= V::size()); debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + debug_assert!(ptr <= end_ptr.sub(V::size())); - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk)); + let chunk = V::load_unaligned(ptr); + let mask = vn1.cmpeq(chunk).movemask(); if mask != 0 { - Some(sub(ptr, start_ptr) + reverse_pos(mask)) + Some(sub(ptr, start_ptr) + reverse_pos::(mask)) } else { None } } -#[target_feature(enable = "sse2")] -unsafe fn reverse_search2( +#[inline(always)] +unsafe fn reverse_search2( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, - vn1: __m128i, - vn2: __m128i, + vn1: V, + vn2: V, ) -> Option { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(sub(end_ptr, start_ptr) >= V::size()); debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= 
end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let eq1 = _mm_cmpeq_epi8(chunk, vn1); - let eq2 = _mm_cmpeq_epi8(chunk, vn2); - if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 { - let mask1 = _mm_movemask_epi8(eq1); - let mask2 = _mm_movemask_epi8(eq2); - Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2)) + debug_assert!(ptr <= end_ptr.sub(V::size())); + + let chunk = V::load_unaligned(ptr); + let eq1 = chunk.cmpeq(vn1); + let eq2 = chunk.cmpeq(vn2); + if eq1.or(eq2).movemask() != 0 { + let mask1 = eq1.movemask(); + let mask2 = eq2.movemask(); + Some(sub(ptr, start_ptr) + reverse_pos2::(mask1, mask2)) } else { None } } -#[target_feature(enable = "sse2")] -unsafe fn reverse_search3( +#[inline(always)] +unsafe fn reverse_search3( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, - vn1: __m128i, - vn2: __m128i, - vn3: __m128i, + vn1: V, + vn2: V, + vn3: V, ) -> Option { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(sub(end_ptr, start_ptr) >= V::size()); debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let eq1 = _mm_cmpeq_epi8(chunk, vn1); - let eq2 = _mm_cmpeq_epi8(chunk, vn2); - let eq3 = _mm_cmpeq_epi8(chunk, vn3); - let or = _mm_or_si128(eq1, eq2); - if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 { - let mask1 = _mm_movemask_epi8(eq1); - let mask2 = _mm_movemask_epi8(eq2); - let mask3 = _mm_movemask_epi8(eq3); - Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3)) + debug_assert!(ptr <= end_ptr.sub(V::size())); + + let chunk = V::load_unaligned(ptr); + let eq1 = chunk.cmpeq(vn1); + let eq2 = chunk.cmpeq(vn2); + let eq3 = chunk.cmpeq(vn3); + let or = eq1.or(eq2); + if or.or(eq3).movemask() != 0 { + let mask1 = eq1.movemask(); + let mask2 = eq2.movemask(); + let mask3 = eq3.movemask(); + Some(sub(ptr, start_ptr) + reverse_pos3::(mask1, mask2, mask3)) } else { None } } /// Compute the position of the first matching byte from the given mask. The -/// position returned is always in the range [0, 15]. +/// position returned is always in the range [0, V::size()). /// /// The mask given is expected to be the result of _mm_movemask_epi8. -fn forward_pos(mask: i32) -> usize { +fn forward_pos(mask: u32) -> usize { // We are dealing with little endian here, where the most significant byte // is at a higher address. That means the least significant bit that is set // corresponds to the position of our first matching byte. That position // corresponds to the number of zeros after the least significant bit. + assert!(cfg!(target_endian = "little")); mask.trailing_zeros() as usize } /// Compute the position of the first matching byte from the given masks. The -/// position returned is always in the range [0, 15]. Each mask corresponds to +/// position returned is always in the range [0, V::size()). Each mask corresponds to /// the equality comparison of a single byte. /// /// The masks given are expected to be the result of _mm_movemask_epi8, where /// at least one of the masks is non-zero (i.e., indicates a match). -fn forward_pos2(mask1: i32, mask2: i32) -> usize { +fn forward_pos2(mask1: u32, mask2: u32) -> usize { debug_assert!(mask1 != 0 || mask2 != 0); forward_pos(mask1 | mask2) } /// Compute the position of the first matching byte from the given masks. The -/// position returned is always in the range [0, 15]. Each mask corresponds to +/// position returned is always in the range [0, V::size()). 
Each mask corresponds to /// the equality comparison of a single byte. /// /// The masks given are expected to be the result of _mm_movemask_epi8, where /// at least one of the masks is non-zero (i.e., indicates a match). -fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { +fn forward_pos3(mask1: u32, mask2: u32, mask3: u32) -> usize { debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); forward_pos(mask1 | mask2 | mask3) } /// Compute the position of the last matching byte from the given mask. The -/// position returned is always in the range [0, 15]. +/// position returned is always in the range [0, V::size()). /// /// The mask given is expected to be the result of _mm_movemask_epi8. -fn reverse_pos(mask: i32) -> usize { +fn reverse_pos(mask: u32) -> usize { // We are dealing with little endian here, where the most significant byte // is at a higher address. That means the most significant bit that is set // corresponds to the position of our last matching byte. The position from - // the end of the mask is therefore the number of leading zeros in a 16 + // the end of the mask is therefore the number of leading zeros in a 32 // bit integer, and the position from the start of the mask is therefore - // 16 - (leading zeros) - 1. - VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1 + // size - (leading zeros) - 1. + let r = 31 - mask.leading_zeros() as usize; + return r; + // let r = V::size() - mask.leading_zeros() as usize - 1; + // return r; } /// Compute the position of the last matching byte from the given masks. The @@ -765,10 +784,10 @@ fn reverse_pos(mask: i32) -> usize { /// /// The masks given are expected to be the result of _mm_movemask_epi8, where /// at least one of the masks is non-zero (i.e., indicates a match). -fn reverse_pos2(mask1: i32, mask2: i32) -> usize { +fn reverse_pos2(mask1: u32, mask2: u32) -> usize { debug_assert!(mask1 != 0 || mask2 != 0); - reverse_pos(mask1 | mask2) + reverse_pos::(mask1 | mask2) } /// Compute the position of the last matching byte from the given masks. The @@ -777,10 +796,10 @@ fn reverse_pos2(mask1: i32, mask2: i32) -> usize { /// /// The masks given are expected to be the result of _mm_movemask_epi8, where /// at least one of the masks is non-zero (i.e., indicates a match). -fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { +fn reverse_pos3(mask1: u32, mask2: u32, mask3: u32) -> usize { debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); - reverse_pos(mask1 | mask2 | mask3) + reverse_pos::(mask1 | mask2 | mask3) } /// Subtract `b` from `a` and return the difference. `a` should be greater than diff --git a/src/memchr/mod.rs b/src/memchr/mod.rs index 09ce6ef..184b7ad 100644 --- a/src/memchr/mod.rs +++ b/src/memchr/mod.rs @@ -7,10 +7,9 @@ pub use self::iter::{Memchr, Memchr2, Memchr3}; mod c; #[allow(dead_code)] pub mod fallback; +mod genericsimd; mod iter; pub mod naive; -#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] -mod x86; /// An iterator over all occurrences of the needle in a haystack. #[inline] @@ -62,6 +61,64 @@ pub fn memrchr3_iter( Memchr3::new(needle1, needle2, needle3, haystack).rev() } +macro_rules! 
delegate { + ($method:ident($($param:ident: $ty:ty),*) $($ret:tt)*) => ({ + if cfg!(miri) { + return naive::$method($($param),*); + } + if cfg!(memchr_runtime_simd) { + #[cfg(target_arch = "x86_64")] + { + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("avx2") { + enable_target_feature_and_call!( + "avx2", + core::arch::x86_64::__m256i, + $method($($param: $ty),*) $($ret)* + ); + } + } + enable_target_feature_and_call!( + "sse2", + core::arch::x86_64::__m128i, + $method($($param: $ty),*) $($ret)* + ); + } + #[cfg(target_arch = "wasm32")] + enable_target_feature_and_call!( + "simd128", + core::arch::wasm32::v128, + $method($($param: $ty),*) $($ret)* + ); + } + maybe_delegate_libc!($method($($param),*)); + fallback::$method($($param),*) + }) +} + +macro_rules! enable_target_feature_and_call { + ($feature:tt, $vector:ty, $method:ident($($param:ident: $ty:ty),*) $($ret:tt)*) => { + #[target_feature(enable = $feature)] + unsafe fn $method($($param: $ty),*) $($ret)* { + genericsimd::$method::<$vector>($($param),*) + } + return unsafe { $method($($param),*) }; + } +} + +macro_rules! maybe_delegate_libc { + (memchr($($param:tt)*)) => ( + #[cfg(memchr_libc)] + return c::memchr($($param)*); + ); + (memrchr($($param:tt)*)) => ( + #[cfg(memchr_libc)] + return c::memrchr($($param)*); + ); + ($($other:tt)*) => (); +} + /// Search for the first occurrence of a byte in a slice. /// /// This returns the index corresponding to the first occurrence of `needle` in @@ -85,43 +142,10 @@ pub fn memrchr3_iter( /// ``` #[inline] pub fn memchr(needle: u8, haystack: &[u8]) -> Option { - #[cfg(miri)] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option { - naive::memchr(n1, haystack) - } - - #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option { - x86::memchr(n1, haystack) - } - - #[cfg(all( - memchr_libc, - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option { - c::memchr(n1, haystack) - } - - #[cfg(all( - not(memchr_libc), - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option { - fallback::memchr(n1, haystack) - } - if haystack.is_empty() { - None - } else { - imp(needle, haystack) + return None; } + delegate!(memchr(needle: u8, haystack: &[u8]) -> Option) } /// Like `memchr`, but searches for either of two bytes instead of just one. @@ -149,32 +173,10 @@ pub fn memchr(needle: u8, haystack: &[u8]) -> Option { /// ``` #[inline] pub fn memchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option { - #[cfg(miri)] - #[inline(always)] - fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { - naive::memchr2(n1, n2, haystack) - } - - #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] - #[inline(always)] - fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { - x86::memchr2(n1, n2, haystack) - } - - #[cfg(all( - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { - fallback::memchr2(n1, n2, haystack) - } - if haystack.is_empty() { - None - } else { - imp(needle1, needle2, haystack) + return None; } + delegate!(memchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option) } /// Like `memchr`, but searches for any of three bytes instead of just one. 
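// NOTE: a rough sketch, assuming x86_64 with `std` and
// `memchr_runtime_simd` enabled, of what
// `delegate!(memchr(needle: u8, haystack: &[u8]) -> Option<usize>)`
// expands to. Wrapper names here are illustrative; the real macro reuses
// `$method` for the inner functions and keeps the `cfg` guards that are
// omitted in this sketch.
fn memchr_expanded(needle: u8, haystack: &[u8]) -> Option<usize> {
    if cfg!(miri) {
        return naive::memchr(needle, haystack);
    }
    if cfg!(memchr_runtime_simd) {
        // AVX2 path: only taken when runtime CPU detection succeeds.
        if is_x86_feature_detected!("avx2") {
            #[target_feature(enable = "avx2")]
            unsafe fn imp_avx2(n1: u8, haystack: &[u8]) -> Option<usize> {
                genericsimd::memchr::<core::arch::x86_64::__m256i>(n1, haystack)
            }
            return unsafe { imp_avx2(needle, haystack) };
        }
        // SSE2 path: always available on x86_64.
        #[target_feature(enable = "sse2")]
        unsafe fn imp_sse2(n1: u8, haystack: &[u8]) -> Option<usize> {
            genericsimd::memchr::<core::arch::x86_64::__m128i>(n1, haystack)
        }
        return unsafe { imp_sse2(needle, haystack) };
    }
    // `maybe_delegate_libc!` inserts a `c::memchr` return here when
    // `memchr_libc` is set; otherwise we fall through to the portable code.
    fallback::memchr(needle, haystack)
}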
@@ -207,32 +209,10 @@ pub fn memchr3( needle3: u8, haystack: &[u8], ) -> Option { - #[cfg(miri)] - #[inline(always)] - fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { - naive::memchr3(n1, n2, n3, haystack) - } - - #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] - #[inline(always)] - fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { - x86::memchr3(n1, n2, n3, haystack) - } - - #[cfg(all( - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { - fallback::memchr3(n1, n2, n3, haystack) - } - if haystack.is_empty() { - None - } else { - imp(needle1, needle2, needle3, haystack) + return None; } + delegate!(memchr3(needle1: u8, needle2: u8, needle3: u8, haystack: &[u8]) -> Option) } /// Search for the last occurrence of a byte in a slice. @@ -258,44 +238,10 @@ pub fn memchr3( /// ``` #[inline] pub fn memrchr(needle: u8, haystack: &[u8]) -> Option { - #[cfg(miri)] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option { - naive::memrchr(n1, haystack) - } - - #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option { - x86::memrchr(n1, haystack) - } - - #[cfg(all( - memchr_libc, - target_os = "linux", - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri) - ))] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option { - c::memrchr(n1, haystack) - } - - #[cfg(all( - not(all(memchr_libc, target_os = "linux")), - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option { - fallback::memrchr(n1, haystack) - } - if haystack.is_empty() { - None - } else { - imp(needle, haystack) + return None; } + delegate!(memrchr(needle: u8, haystack: &[u8]) -> Option) } /// Like `memrchr`, but searches for either of two bytes instead of just one. @@ -323,32 +269,10 @@ pub fn memrchr(needle: u8, haystack: &[u8]) -> Option { /// ``` #[inline] pub fn memrchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option { - #[cfg(miri)] - #[inline(always)] - fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { - naive::memrchr2(n1, n2, haystack) - } - - #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] - #[inline(always)] - fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { - x86::memrchr2(n1, n2, haystack) - } - - #[cfg(all( - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { - fallback::memrchr2(n1, n2, haystack) - } - if haystack.is_empty() { - None - } else { - imp(needle1, needle2, haystack) + return None; } + delegate!(memrchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option) } /// Like `memrchr`, but searches for any of three bytes instead of just one. 
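// NOTE: an illustrative sanity check, not taken from the patch. Whichever
// backend `delegate!` picks at runtime (AVX2, SSE2, libc or the fallback),
// the public functions above should agree with a by-hand count of the
// haystack.
#[cfg(test)]
mod delegate_sanity {
    #[test]
    fn forward_and_reverse_agree() {
        let haystack = b"xxxyzzzyxx";
        assert_eq!(super::memchr(b'y', haystack), Some(3));
        assert_eq!(super::memrchr(b'y', haystack), Some(7));
        assert_eq!(super::memchr2(b'y', b'z', haystack), Some(3));
        assert_eq!(super::memrchr2(b'y', b'z', haystack), Some(7));
        assert_eq!(super::memchr3(b'a', b'b', b'y', haystack), Some(3));
        assert_eq!(super::memchr(b'q', haystack), None);
    }
}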
@@ -381,30 +305,8 @@ pub fn memrchr3( needle3: u8, haystack: &[u8], ) -> Option { - #[cfg(miri)] - #[inline(always)] - fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { - naive::memrchr3(n1, n2, n3, haystack) - } - - #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] - #[inline(always)] - fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { - x86::memrchr3(n1, n2, n3, haystack) - } - - #[cfg(all( - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { - fallback::memrchr3(n1, n2, n3, haystack) - } - if haystack.is_empty() { - None - } else { - imp(needle1, needle2, needle3, haystack) + return None; } + delegate!(memrchr3(needle1: u8, needle2: u8, needle3: u8, haystack: &[u8]) -> Option) } diff --git a/src/memchr/x86/avx.rs b/src/memchr/x86/avx.rs deleted file mode 100644 index 5351230..0000000 --- a/src/memchr/x86/avx.rs +++ /dev/null @@ -1,755 +0,0 @@ -use core::{arch::x86_64::*, cmp, mem::size_of}; - -use super::sse2; - -const VECTOR_SIZE: usize = size_of::<__m256i>(); -const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; - -// The number of bytes to loop at in one iteration of memchr/memrchr. -const LOOP_SIZE: usize = 4 * VECTOR_SIZE; - -// The number of bytes to loop at in one iteration of memchr2/memrchr2 and -// memchr3/memrchr3. There was no observable difference between 128 and 64 -// bytes in benchmarks. memchr3 in particular only gets a very slight speed up -// from the loop unrolling. -const LOOP_SIZE2: usize = 2 * VECTOR_SIZE; - -#[target_feature(enable = "avx2")] -pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option { - // For a high level explanation for how this algorithm works, see the - // sse2 implementation. The avx implementation here is the same, but with - // 256-bit vectors instead of 128-bit vectors. - - // This routine is called whenever a match is detected. It is specifically - // marked as unlineable because it improves the codegen of the unrolled - // loop below. Inlining this seems to cause codegen with some extra adds - // and a load that aren't necessary. This seems to result in about a 10% - // improvement for the memchr1/crate/huge/never benchmark. - // - // Interestingly, I couldn't observe a similar improvement for memrchr. - #[cold] - #[inline(never)] - #[target_feature(enable = "avx2")] - unsafe fn matched( - start_ptr: *const u8, - ptr: *const u8, - eqa: __m256i, - eqb: __m256i, - eqc: __m256i, - eqd: __m256i, - ) -> usize { - let mut at = sub(ptr, start_ptr); - let mask = _mm256_movemask_epi8(eqa); - if mask != 0 { - return at + forward_pos(mask); - } - - at += VECTOR_SIZE; - let mask = _mm256_movemask_epi8(eqb); - if mask != 0 { - return at + forward_pos(mask); - } - - at += VECTOR_SIZE; - let mask = _mm256_movemask_epi8(eqc); - if mask != 0 { - return at + forward_pos(mask); - } - - at += VECTOR_SIZE; - let mask = _mm256_movemask_epi8(eqd); - debug_assert!(mask != 0); - at + forward_pos(mask) - } - - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = start_ptr; - - if haystack.len() < VECTOR_SIZE { - // For small haystacks, defer to the SSE2 implementation. Codegen - // suggests this completely avoids touching the AVX vectors. 
- return sse2::memchr(n1, haystack); - } - - let vn1 = _mm256_set1_epi8(n1 as i8); - let loop_size = cmp::min(LOOP_SIZE, haystack.len()); - if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { - return Some(i); - } - - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); - while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm256_load_si256(ptr as *const __m256i); - let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); - let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i); - let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i); - let eqa = _mm256_cmpeq_epi8(vn1, a); - let eqb = _mm256_cmpeq_epi8(vn1, b); - let eqc = _mm256_cmpeq_epi8(vn1, c); - let eqd = _mm256_cmpeq_epi8(vn1, d); - let or1 = _mm256_or_si256(eqa, eqb); - let or2 = _mm256_or_si256(eqc, eqd); - let or3 = _mm256_or_si256(or1, or2); - - if _mm256_movemask_epi8(or3) != 0 { - return Some(matched(start_ptr, ptr, eqa, eqb, eqc, eqd)); - } - ptr = ptr.add(loop_size); - } - while ptr <= end_ptr.sub(VECTOR_SIZE) { - debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); - - if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { - return Some(i); - } - ptr = ptr.add(VECTOR_SIZE); - } - if ptr < end_ptr { - debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); - ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); - debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); - - return forward_search1(start_ptr, end_ptr, ptr, vn1); - } - None -} - -#[target_feature(enable = "avx2")] -pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { - #[cold] - #[inline(never)] - #[target_feature(enable = "avx2")] - unsafe fn matched( - start_ptr: *const u8, - ptr: *const u8, - eqa1: __m256i, - eqa2: __m256i, - eqb1: __m256i, - eqb2: __m256i, - ) -> usize { - let mut at = sub(ptr, start_ptr); - let mask1 = _mm256_movemask_epi8(eqa1); - let mask2 = _mm256_movemask_epi8(eqa2); - if mask1 != 0 || mask2 != 0 { - return at + forward_pos2(mask1, mask2); - } - - at += VECTOR_SIZE; - let mask1 = _mm256_movemask_epi8(eqb1); - let mask2 = _mm256_movemask_epi8(eqb2); - at + forward_pos2(mask1, mask2) - } - - let vn1 = _mm256_set1_epi8(n1 as i8); - let vn2 = _mm256_set1_epi8(n2 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = start_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr < end_ptr { - if *ptr == n1 || *ptr == n2 { - return Some(sub(ptr, start_ptr)); - } - ptr = ptr.offset(1); - } - return None; - } - - if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { - return Some(i); - } - - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); - while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm256_load_si256(ptr as *const __m256i); - let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); - let eqa1 = _mm256_cmpeq_epi8(vn1, a); - let eqb1 = _mm256_cmpeq_epi8(vn1, b); - let eqa2 = _mm256_cmpeq_epi8(vn2, a); - let eqb2 = _mm256_cmpeq_epi8(vn2, b); - let or1 = _mm256_or_si256(eqa1, eqb1); - let or2 = _mm256_or_si256(eqa2, eqb2); - let or3 = _mm256_or_si256(or1, or2); - if _mm256_movemask_epi8(or3) != 0 { - return 
Some(matched(start_ptr, ptr, eqa1, eqa2, eqb1, eqb2)); - } - ptr = ptr.add(loop_size); - } - while ptr <= end_ptr.sub(VECTOR_SIZE) { - if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { - return Some(i); - } - ptr = ptr.add(VECTOR_SIZE); - } - if ptr < end_ptr { - debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); - ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); - debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); - - return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2); - } - None -} - -#[target_feature(enable = "avx2")] -pub unsafe fn memchr3( - n1: u8, - n2: u8, - n3: u8, - haystack: &[u8], -) -> Option { - #[cold] - #[inline(never)] - #[target_feature(enable = "avx2")] - unsafe fn matched( - start_ptr: *const u8, - ptr: *const u8, - eqa1: __m256i, - eqa2: __m256i, - eqa3: __m256i, - eqb1: __m256i, - eqb2: __m256i, - eqb3: __m256i, - ) -> usize { - let mut at = sub(ptr, start_ptr); - let mask1 = _mm256_movemask_epi8(eqa1); - let mask2 = _mm256_movemask_epi8(eqa2); - let mask3 = _mm256_movemask_epi8(eqa3); - if mask1 != 0 || mask2 != 0 || mask3 != 0 { - return at + forward_pos3(mask1, mask2, mask3); - } - - at += VECTOR_SIZE; - let mask1 = _mm256_movemask_epi8(eqb1); - let mask2 = _mm256_movemask_epi8(eqb2); - let mask3 = _mm256_movemask_epi8(eqb3); - at + forward_pos3(mask1, mask2, mask3) - } - - let vn1 = _mm256_set1_epi8(n1 as i8); - let vn2 = _mm256_set1_epi8(n2 as i8); - let vn3 = _mm256_set1_epi8(n3 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = start_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr < end_ptr { - if *ptr == n1 || *ptr == n2 || *ptr == n3 { - return Some(sub(ptr, start_ptr)); - } - ptr = ptr.offset(1); - } - return None; - } - - if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { - return Some(i); - } - - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); - while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm256_load_si256(ptr as *const __m256i); - let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); - let eqa1 = _mm256_cmpeq_epi8(vn1, a); - let eqb1 = _mm256_cmpeq_epi8(vn1, b); - let eqa2 = _mm256_cmpeq_epi8(vn2, a); - let eqb2 = _mm256_cmpeq_epi8(vn2, b); - let eqa3 = _mm256_cmpeq_epi8(vn3, a); - let eqb3 = _mm256_cmpeq_epi8(vn3, b); - let or1 = _mm256_or_si256(eqa1, eqb1); - let or2 = _mm256_or_si256(eqa2, eqb2); - let or3 = _mm256_or_si256(eqa3, eqb3); - let or4 = _mm256_or_si256(or1, or2); - let or5 = _mm256_or_si256(or3, or4); - if _mm256_movemask_epi8(or5) != 0 { - return Some(matched( - start_ptr, ptr, eqa1, eqa2, eqa3, eqb1, eqb2, eqb3, - )); - } - ptr = ptr.add(loop_size); - } - while ptr <= end_ptr.sub(VECTOR_SIZE) { - if let Some(i) = - forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) - { - return Some(i); - } - ptr = ptr.add(VECTOR_SIZE); - } - if ptr < end_ptr { - debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); - ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); - debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); - - return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); - } - None -} - -#[target_feature(enable = "avx2")] -pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option { - let vn1 = _mm256_set1_epi8(n1 as i8); - let len = haystack.len(); - let loop_size = 
cmp::min(LOOP_SIZE, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = end_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr > start_ptr { - ptr = ptr.offset(-1); - if *ptr == n1 { - return Some(sub(ptr, start_ptr)); - } - } - return None; - } - - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { - return Some(i); - } - - ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; - debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - ptr = ptr.sub(loop_size); - let a = _mm256_load_si256(ptr as *const __m256i); - let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); - let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i); - let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i); - let eqa = _mm256_cmpeq_epi8(vn1, a); - let eqb = _mm256_cmpeq_epi8(vn1, b); - let eqc = _mm256_cmpeq_epi8(vn1, c); - let eqd = _mm256_cmpeq_epi8(vn1, d); - let or1 = _mm256_or_si256(eqa, eqb); - let or2 = _mm256_or_si256(eqc, eqd); - let or3 = _mm256_or_si256(or1, or2); - if _mm256_movemask_epi8(or3) != 0 { - let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr); - let mask = _mm256_movemask_epi8(eqd); - if mask != 0 { - return Some(at + reverse_pos(mask)); - } - - at -= VECTOR_SIZE; - let mask = _mm256_movemask_epi8(eqc); - if mask != 0 { - return Some(at + reverse_pos(mask)); - } - - at -= VECTOR_SIZE; - let mask = _mm256_movemask_epi8(eqb); - if mask != 0 { - return Some(at + reverse_pos(mask)); - } - - at -= VECTOR_SIZE; - let mask = _mm256_movemask_epi8(eqa); - debug_assert!(mask != 0); - return Some(at + reverse_pos(mask)); - } - } - while ptr >= start_ptr.add(VECTOR_SIZE) { - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { - return Some(i); - } - } - if ptr > start_ptr { - debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); - return reverse_search1(start_ptr, end_ptr, start_ptr, vn1); - } - None -} - -#[target_feature(enable = "avx2")] -pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { - let vn1 = _mm256_set1_epi8(n1 as i8); - let vn2 = _mm256_set1_epi8(n2 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = end_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr > start_ptr { - ptr = ptr.offset(-1); - if *ptr == n1 || *ptr == n2 { - return Some(sub(ptr, start_ptr)); - } - } - return None; - } - - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { - return Some(i); - } - - ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; - debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - ptr = ptr.sub(loop_size); - let a = _mm256_load_si256(ptr as *const __m256i); - let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); - let eqa1 = _mm256_cmpeq_epi8(vn1, a); - let eqb1 = _mm256_cmpeq_epi8(vn1, b); - let eqa2 = _mm256_cmpeq_epi8(vn2, a); - let eqb2 = _mm256_cmpeq_epi8(vn2, b); - let or1 = _mm256_or_si256(eqa1, eqb1); - let or2 = _mm256_or_si256(eqa2, eqb2); - let or3 = _mm256_or_si256(or1, or2); - if _mm256_movemask_epi8(or3) != 0 { - let mut at = sub(ptr.add(VECTOR_SIZE), 
start_ptr); - let mask1 = _mm256_movemask_epi8(eqb1); - let mask2 = _mm256_movemask_epi8(eqb2); - if mask1 != 0 || mask2 != 0 { - return Some(at + reverse_pos2(mask1, mask2)); - } - - at -= VECTOR_SIZE; - let mask1 = _mm256_movemask_epi8(eqa1); - let mask2 = _mm256_movemask_epi8(eqa2); - return Some(at + reverse_pos2(mask1, mask2)); - } - } - while ptr >= start_ptr.add(VECTOR_SIZE) { - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { - return Some(i); - } - } - if ptr > start_ptr { - debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); - return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2); - } - None -} - -#[target_feature(enable = "avx2")] -pub unsafe fn memrchr3( - n1: u8, - n2: u8, - n3: u8, - haystack: &[u8], -) -> Option { - let vn1 = _mm256_set1_epi8(n1 as i8); - let vn2 = _mm256_set1_epi8(n2 as i8); - let vn3 = _mm256_set1_epi8(n3 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = end_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr > start_ptr { - ptr = ptr.offset(-1); - if *ptr == n1 || *ptr == n2 || *ptr == n3 { - return Some(sub(ptr, start_ptr)); - } - } - return None; - } - - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { - return Some(i); - } - - ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; - debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - ptr = ptr.sub(loop_size); - let a = _mm256_load_si256(ptr as *const __m256i); - let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); - let eqa1 = _mm256_cmpeq_epi8(vn1, a); - let eqb1 = _mm256_cmpeq_epi8(vn1, b); - let eqa2 = _mm256_cmpeq_epi8(vn2, a); - let eqb2 = _mm256_cmpeq_epi8(vn2, b); - let eqa3 = _mm256_cmpeq_epi8(vn3, a); - let eqb3 = _mm256_cmpeq_epi8(vn3, b); - let or1 = _mm256_or_si256(eqa1, eqb1); - let or2 = _mm256_or_si256(eqa2, eqb2); - let or3 = _mm256_or_si256(eqa3, eqb3); - let or4 = _mm256_or_si256(or1, or2); - let or5 = _mm256_or_si256(or3, or4); - if _mm256_movemask_epi8(or5) != 0 { - let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); - let mask1 = _mm256_movemask_epi8(eqb1); - let mask2 = _mm256_movemask_epi8(eqb2); - let mask3 = _mm256_movemask_epi8(eqb3); - if mask1 != 0 || mask2 != 0 || mask3 != 0 { - return Some(at + reverse_pos3(mask1, mask2, mask3)); - } - - at -= VECTOR_SIZE; - let mask1 = _mm256_movemask_epi8(eqa1); - let mask2 = _mm256_movemask_epi8(eqa2); - let mask3 = _mm256_movemask_epi8(eqa3); - return Some(at + reverse_pos3(mask1, mask2, mask3)); - } - } - while ptr >= start_ptr.add(VECTOR_SIZE) { - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = - reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) - { - return Some(i); - } - } - if ptr > start_ptr { - debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); - return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3); - } - None -} - -#[target_feature(enable = "avx2")] -unsafe fn forward_search1( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m256i, -) -> Option { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm256_loadu_si256(ptr as *const __m256i); - let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, 
vn1)); - if mask != 0 { - Some(sub(ptr, start_ptr) + forward_pos(mask)) - } else { - None - } -} - -#[target_feature(enable = "avx2")] -unsafe fn forward_search2( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m256i, - vn2: __m256i, -) -> Option { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm256_loadu_si256(ptr as *const __m256i); - let eq1 = _mm256_cmpeq_epi8(chunk, vn1); - let eq2 = _mm256_cmpeq_epi8(chunk, vn2); - if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 { - let mask1 = _mm256_movemask_epi8(eq1); - let mask2 = _mm256_movemask_epi8(eq2); - Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2)) - } else { - None - } -} - -#[target_feature(enable = "avx2")] -unsafe fn forward_search3( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m256i, - vn2: __m256i, - vn3: __m256i, -) -> Option { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm256_loadu_si256(ptr as *const __m256i); - let eq1 = _mm256_cmpeq_epi8(chunk, vn1); - let eq2 = _mm256_cmpeq_epi8(chunk, vn2); - let eq3 = _mm256_cmpeq_epi8(chunk, vn3); - let or = _mm256_or_si256(eq1, eq2); - if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 { - let mask1 = _mm256_movemask_epi8(eq1); - let mask2 = _mm256_movemask_epi8(eq2); - let mask3 = _mm256_movemask_epi8(eq3); - Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3)) - } else { - None - } -} - -#[target_feature(enable = "avx2")] -unsafe fn reverse_search1( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m256i, -) -> Option { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm256_loadu_si256(ptr as *const __m256i); - let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(vn1, chunk)); - if mask != 0 { - Some(sub(ptr, start_ptr) + reverse_pos(mask)) - } else { - None - } -} - -#[target_feature(enable = "avx2")] -unsafe fn reverse_search2( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m256i, - vn2: __m256i, -) -> Option { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm256_loadu_si256(ptr as *const __m256i); - let eq1 = _mm256_cmpeq_epi8(chunk, vn1); - let eq2 = _mm256_cmpeq_epi8(chunk, vn2); - if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 { - let mask1 = _mm256_movemask_epi8(eq1); - let mask2 = _mm256_movemask_epi8(eq2); - Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2)) - } else { - None - } -} - -#[target_feature(enable = "avx2")] -unsafe fn reverse_search3( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m256i, - vn2: __m256i, - vn3: __m256i, -) -> Option { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm256_loadu_si256(ptr as *const __m256i); - let eq1 = _mm256_cmpeq_epi8(chunk, vn1); - let eq2 = _mm256_cmpeq_epi8(chunk, vn2); - let eq3 = _mm256_cmpeq_epi8(chunk, vn3); - let or = _mm256_or_si256(eq1, eq2); - if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 { - let mask1 = _mm256_movemask_epi8(eq1); - let mask2 = _mm256_movemask_epi8(eq2); - let mask3 = 
_mm256_movemask_epi8(eq3); - Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3)) - } else { - None - } -} - -/// Compute the position of the first matching byte from the given mask. The -/// position returned is always in the range [0, 31]. -/// -/// The mask given is expected to be the result of _mm256_movemask_epi8. -fn forward_pos(mask: i32) -> usize { - // We are dealing with little endian here, where the most significant byte - // is at a higher address. That means the least significant bit that is set - // corresponds to the position of our first matching byte. That position - // corresponds to the number of zeros after the least significant bit. - mask.trailing_zeros() as usize -} - -/// Compute the position of the first matching byte from the given masks. The -/// position returned is always in the range [0, 31]. Each mask corresponds to -/// the equality comparison of a single byte. -/// -/// The masks given are expected to be the result of _mm256_movemask_epi8, -/// where at least one of the masks is non-zero (i.e., indicates a match). -fn forward_pos2(mask1: i32, mask2: i32) -> usize { - debug_assert!(mask1 != 0 || mask2 != 0); - - forward_pos(mask1 | mask2) -} - -/// Compute the position of the first matching byte from the given masks. The -/// position returned is always in the range [0, 31]. Each mask corresponds to -/// the equality comparison of a single byte. -/// -/// The masks given are expected to be the result of _mm256_movemask_epi8, -/// where at least one of the masks is non-zero (i.e., indicates a match). -fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { - debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); - - forward_pos(mask1 | mask2 | mask3) -} - -/// Compute the position of the last matching byte from the given mask. The -/// position returned is always in the range [0, 31]. -/// -/// The mask given is expected to be the result of _mm256_movemask_epi8. -fn reverse_pos(mask: i32) -> usize { - // We are dealing with little endian here, where the most significant byte - // is at a higher address. That means the most significant bit that is set - // corresponds to the position of our last matching byte. The position from - // the end of the mask is therefore the number of leading zeros in a 32 - // bit integer, and the position from the start of the mask is therefore - // 32 - (leading zeros) - 1. - VECTOR_SIZE - (mask as u32).leading_zeros() as usize - 1 -} - -/// Compute the position of the last matching byte from the given masks. The -/// position returned is always in the range [0, 31]. Each mask corresponds to -/// the equality comparison of a single byte. -/// -/// The masks given are expected to be the result of _mm256_movemask_epi8, -/// where at least one of the masks is non-zero (i.e., indicates a match). -fn reverse_pos2(mask1: i32, mask2: i32) -> usize { - debug_assert!(mask1 != 0 || mask2 != 0); - - reverse_pos(mask1 | mask2) -} - -/// Compute the position of the last matching byte from the given masks. The -/// position returned is always in the range [0, 31]. Each mask corresponds to -/// the equality comparison of a single byte. -/// -/// The masks given are expected to be the result of _mm256_movemask_epi8, -/// where at least one of the masks is non-zero (i.e., indicates a match). -fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { - debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); - - reverse_pos(mask1 | mask2 | mask3) -} - -/// Subtract `b` from `a` and return the difference. 
`a` should be greater than -/// or equal to `b`. -fn sub(a: *const u8, b: *const u8) -> usize { - debug_assert!(a >= b); - (a as usize) - (b as usize) -} diff --git a/src/memchr/x86/mod.rs b/src/memchr/x86/mod.rs deleted file mode 100644 index aec35db..0000000 --- a/src/memchr/x86/mod.rs +++ /dev/null @@ -1,148 +0,0 @@ -use super::fallback; - -// We only use AVX when we can detect at runtime whether it's available, which -// requires std. -#[cfg(feature = "std")] -mod avx; -mod sse2; - -/// This macro employs a gcc-like "ifunc" trick where by upon first calling -/// `memchr` (for example), CPU feature detection will be performed at runtime -/// to determine the best implementation to use. After CPU feature detection -/// is done, we replace `memchr`'s function pointer with the selection. Upon -/// subsequent invocations, the CPU-specific routine is invoked directly, which -/// skips the CPU feature detection and subsequent branch that's required. -/// -/// While this typically doesn't matter for rare occurrences or when used on -/// larger haystacks, `memchr` can be called in tight loops where the overhead -/// of this branch can actually add up *and is measurable*. This trick was -/// necessary to bring this implementation up to glibc's speeds for the 'tiny' -/// benchmarks, for example. -/// -/// At some point, I expect the Rust ecosystem will get a nice macro for doing -/// exactly this, at which point, we can replace our hand-jammed version of it. -/// -/// N.B. The ifunc strategy does prevent function inlining of course, but -/// on modern CPUs, you'll probably end up with the AVX2 implementation, -/// which probably can't be inlined anyway---unless you've compiled your -/// entire program with AVX2 enabled. However, even then, the various memchr -/// implementations aren't exactly small, so inlining might not help anyway! -/// -/// # Safety -/// -/// Callers must ensure that fnty is function pointer type. -#[cfg(feature = "std")] -macro_rules! unsafe_ifunc { - ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{ - use std::{mem, sync::atomic::{AtomicPtr, Ordering}}; - - type FnRaw = *mut (); - - static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw); - - fn detect($($needle: u8),+, haystack: &[u8]) -> Option { - let fun = - if cfg!(memchr_runtime_avx) && is_x86_feature_detected!("avx2") { - avx::$name as FnRaw - } else if cfg!(memchr_runtime_sse2) { - sse2::$name as FnRaw - } else { - fallback::$name as FnRaw - }; - FN.store(fun as FnRaw, Ordering::Relaxed); - // SAFETY: By virtue of the caller contract, $fnty is a function - // pointer, which is always safe to transmute with a *mut (). - // Also, if 'fun is the AVX routine, then it is guaranteed to be - // supported since we checked the avx2 feature. - unsafe { - mem::transmute::(fun)($($needle),+, haystack) - } - } - - // SAFETY: By virtue of the caller contract, $fnty is a function - // pointer, which is always safe to transmute with a *mut (). Also, if - // 'fun is the AVX routine, then it is guaranteed to be supported since - // we checked the avx2 feature. - unsafe { - let fun = FN.load(Ordering::Relaxed); - mem::transmute::(fun)($($needle),+, $haystack) - } - }} -} - -/// When std isn't available to provide runtime CPU feature detection, or if -/// runtime CPU feature detection has been explicitly disabled, then just -/// call our optimized SSE2 routine directly. SSE2 is avalbale on all x86_64 -/// targets, so no CPU feature detection is necessary. 
-/// -/// # Safety -/// -/// There are no safety requirements for this definition of the macro. It is -/// safe for all inputs since it is restricted to either the fallback routine -/// or the SSE routine, which is always safe to call on x86_64. -#[cfg(not(feature = "std"))] -macro_rules! unsafe_ifunc { - ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{ - if cfg!(memchr_runtime_sse2) { - unsafe { sse2::$name($($needle),+, $haystack) } - } else { - fallback::$name($($needle),+, $haystack) - } - }} -} - -#[inline(always)] -pub fn memchr(n1: u8, haystack: &[u8]) -> Option { - unsafe_ifunc!(fn(u8, &[u8]) -> Option, memchr, haystack, n1) -} - -#[inline(always)] -pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { - unsafe_ifunc!( - fn(u8, u8, &[u8]) -> Option, - memchr2, - haystack, - n1, - n2 - ) -} - -#[inline(always)] -pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { - unsafe_ifunc!( - fn(u8, u8, u8, &[u8]) -> Option, - memchr3, - haystack, - n1, - n2, - n3 - ) -} - -#[inline(always)] -pub fn memrchr(n1: u8, haystack: &[u8]) -> Option { - unsafe_ifunc!(fn(u8, &[u8]) -> Option, memrchr, haystack, n1) -} - -#[inline(always)] -pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { - unsafe_ifunc!( - fn(u8, u8, &[u8]) -> Option, - memrchr2, - haystack, - n1, - n2 - ) -} - -#[inline(always)] -pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { - unsafe_ifunc!( - fn(u8, u8, u8, &[u8]) -> Option, - memrchr3, - haystack, - n1, - n2, - n3 - ) -} diff --git a/src/memmem/genericsimd.rs b/src/memmem/genericsimd.rs index 28bfdab..eb31785 100644 --- a/src/memmem/genericsimd.rs +++ b/src/memmem/genericsimd.rs @@ -1,6 +1,7 @@ use core::mem::size_of; -use crate::memmem::{util::memcmp, vector::Vector, NeedleInfo}; +use crate::memmem::{util::memcmp, NeedleInfo}; +use crate::vector::Vector; /// The minimum length of a needle required for this algorithm. The minimum /// is 2 since a length of 1 should just use memchr and a length of 0 isn't diff --git a/src/memmem/mod.rs b/src/memmem/mod.rs index 0dd6186..6c5c10a 100644 --- a/src/memmem/mod.rs +++ b/src/memmem/mod.rs @@ -153,9 +153,6 @@ mod rabinkarp; mod rarebytes; mod twoway; mod util; -// SIMD is only supported on x86_64 currently. -#[cfg(target_arch = "x86_64")] -mod vector; #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] mod x86; diff --git a/src/memmem/prefilter/genericsimd.rs b/src/memmem/prefilter/genericsimd.rs index 1a6e387..abdfe23 100644 --- a/src/memmem/prefilter/genericsimd.rs +++ b/src/memmem/prefilter/genericsimd.rs @@ -2,9 +2,9 @@ use core::mem::size_of; use crate::memmem::{ prefilter::{PrefilterFnTy, PrefilterState}, - vector::Vector, NeedleInfo, }; +use crate::vector::Vector; /// The implementation of the forward vector accelerated candidate finder. 
/// diff --git a/src/tests/memchr/iter.rs b/src/tests/memchr/iter.rs index 80ea5c2..182b555 100644 --- a/src/tests/memchr/iter.rs +++ b/src/tests/memchr/iter.rs @@ -228,3 +228,17 @@ fn positions3<'a>( .map(|t| t.0); Box::new(it) } + +#[test] +fn wat() { + let data = [ + 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 255, + 1, 1, 1, 255, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + ]; + let needle = 0; + let answer = positions1(needle, &data); + println!("{:?}", positions1(needle, &data).collect::>()); + println!("{:?}", Memchr::new(needle, &data).collect::>()); + assert!(answer.eq(Memchr::new(needle, &data))); +} diff --git a/src/memmem/vector.rs b/src/vector.rs similarity index 75% rename from src/memmem/vector.rs rename to src/vector.rs index a67d3c5..9ff0ac3 100644 --- a/src/memmem/vector.rs +++ b/src/vector.rs @@ -19,8 +19,22 @@ /// to ensure they get appropriately inlined. (inline(always) cannot be used /// with target_feature.) pub(crate) trait Vector: Copy + core::fmt::Debug { + /// Returns the size of this vector, in bytes + #[inline] + fn size() -> usize { + core::mem::size_of::() + } + + /// Returns a mask used to align pointers to this vector's alignment + #[inline] + fn align_mask() -> usize { + Self::size() - 1 + } + /// _mm_set1_epi8 or _mm256_set1_epi8 unsafe fn splat(byte: u8) -> Self; + /// _mm_load_si128 or _mm256_load_si256 + unsafe fn load_aligned(data: *const u8) -> Self; /// _mm_loadu_si128 or _mm256_loadu_si256 unsafe fn load_unaligned(data: *const u8) -> Self; /// _mm_movemask_epi8 or _mm256_movemask_epi8 @@ -29,6 +43,8 @@ pub(crate) trait Vector: Copy + core::fmt::Debug { unsafe fn cmpeq(self, vector2: Self) -> Self; /// _mm_and_si128 or _mm256_and_si256 unsafe fn and(self, vector2: Self) -> Self; + /// _mm_or_si128 or _mm256_or_si256 + unsafe fn or(self, vector2: Self) -> Self; } #[cfg(target_arch = "x86_64")] @@ -42,6 +58,11 @@ mod x86sse { _mm_set1_epi8(byte as i8) } + #[inline(always)] + unsafe fn load_aligned(data: *const u8) -> __m128i { + _mm_load_si128(data as *const __m128i) + } + #[inline(always)] unsafe fn load_unaligned(data: *const u8) -> __m128i { _mm_loadu_si128(data as *const __m128i) @@ -61,6 +82,11 @@ mod x86sse { unsafe fn and(self, vector2: Self) -> __m128i { _mm_and_si128(self, vector2) } + + #[inline(always)] + unsafe fn or(self, vector2: Self) -> __m128i { + _mm_or_si128(self, vector2) + } } } @@ -75,6 +101,11 @@ mod x86avx { _mm256_set1_epi8(byte as i8) } + #[inline(always)] + unsafe fn load_aligned(data: *const u8) -> __m256i { + _mm256_load_si256(data as *const __m256i) + } + #[inline(always)] unsafe fn load_unaligned(data: *const u8) -> __m256i { _mm256_loadu_si256(data as *const __m256i) @@ -94,5 +125,10 @@ mod x86avx { unsafe fn and(self, vector2: Self) -> __m256i { _mm256_and_si256(self, vector2) } + + #[inline(always)] + unsafe fn or(self, vector2: Self) -> __m256i { + _mm256_or_si256(self, vector2) + } } }
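The deleted AVX `memrchr` routines above all share one shape: haystacks shorter than a vector are scanned bytewise, one unaligned load covers the final bytes, the end pointer is rounded down to the vector alignment so the main loop can use aligned loads, and a last full-width load from `start_ptr` may overlap bytes that were already examined. The sketch below models only that pointer arithmetic in scalar code, with illustrative names and single-vector steps in place of the 4x-unrolled loop, to make the rounding and the overlap explicit.

```rust
// A scalar model of the reverse-scan pointer arithmetic used by the deleted
// AVX memrchr. Addresses are plain usizes here; the names are illustrative,
// not the crate's API.
const VECTOR_SIZE: usize = 32;
const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

fn reverse_chunk_starts(start: usize, end: usize) -> Vec<usize> {
    assert!(end - start >= VECTOR_SIZE);
    let mut starts = Vec::new();
    // One unaligned "load" covering the last VECTOR_SIZE bytes.
    starts.push(end - VECTOR_SIZE);
    // Round the end address down to the vector alignment and walk backwards
    // with aligned chunks (the real code takes them four at a time first).
    let mut ptr = end & !VECTOR_ALIGN;
    while ptr >= start + VECTOR_SIZE {
        ptr -= VECTOR_SIZE;
        starts.push(ptr);
    }
    // If bytes remain at the front, re-read a full chunk starting at `start`.
    // It overlaps already-scanned bytes, which is harmless because the scan
    // reports the highest-index match it finds in that chunk.
    if ptr > start {
        starts.push(start);
    }
    starts
}

fn main() {
    // E.g. a 100-byte buffer based at address 8: chunks start at 76, 64, 32, 8.
    println!("{:?}", reverse_chunk_starts(8, 108));
}
```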
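The `forward_pos`/`reverse_pos` helpers turn a movemask result into a byte offset: bit `i` of the mask is set when byte `i` of the vector compared equal, so the first match is the count of trailing zeros and the last match is `VECTOR_SIZE - leading_zeros - 1`. A small worked example of that arithmetic, in plain Rust with no intrinsics:

```rust
// Worked example of the movemask -> index math in forward_pos/reverse_pos.
// Bit i of the mask corresponds to byte i of the 32-byte vector.
fn main() {
    const VECTOR_SIZE: usize = 32;
    // Suppose bytes 3, 9 and 20 of a vector compared equal to the needle.
    let mask: i32 = (1 << 3) | (1 << 9) | (1 << 20);

    // First match: number of zeros below the least significant set bit.
    let first = mask.trailing_zeros() as usize;
    // Last match: 32 - (zeros above the most significant set bit) - 1.
    let last = VECTOR_SIZE - (mask as u32).leading_zeros() as usize - 1;

    assert_eq!(first, 3);
    assert_eq!(last, 20);

    // The two- and three-needle variants just OR the per-needle masks
    // together first, since any set bit is a match for *some* needle.
    let mask2: i32 = 1 << 5;
    assert_eq!((mask | mask2).trailing_zeros(), 3);
}
```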
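The `unsafe_ifunc!` macro removed along with `x86/mod.rs` implements the "ifunc" trick its comment describes: an `AtomicPtr` initially points at a `detect` function, the first call performs CPU feature detection, stores the chosen implementation back into the atomic, and every later call loads the pointer and jumps through it directly. Below is a stripped-down, single-signature sketch of the same pattern for x86_64; the two implementation functions are safe stand-ins, not the crate's SIMD routines.

```rust
use std::mem;
use std::sync::atomic::{AtomicPtr, Ordering};

// The concrete function signature being dispatched.
type MemchrFn = fn(u8, &[u8]) -> Option<usize>;
type FnRaw = *mut ();

// Starts out pointing at `detect`; after the first call it points at the
// selected implementation, so later calls skip feature detection entirely.
static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw);

// Stand-ins for the fallback and AVX2/SSE2 routines. In the real crate the
// fast paths are unsafe #[target_feature] functions behind the same pointer.
fn fallback_impl(n1: u8, haystack: &[u8]) -> Option<usize> {
    haystack.iter().position(|&b| b == n1)
}

fn fast_impl(n1: u8, haystack: &[u8]) -> Option<usize> {
    haystack.iter().position(|&b| b == n1)
}

fn detect(n1: u8, haystack: &[u8]) -> Option<usize> {
    // x86_64 only, like the code above.
    let chosen: MemchrFn = if is_x86_feature_detected!("avx2") {
        fast_impl
    } else {
        fallback_impl
    };
    FN.store(chosen as FnRaw, Ordering::Relaxed);
    chosen(n1, haystack)
}

pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    let fun = FN.load(Ordering::Relaxed);
    // Sound here because every pointer ever stored in FN is a MemchrFn.
    unsafe { mem::transmute::<FnRaw, MemchrFn>(fun)(n1, haystack) }
}

fn main() {
    assert_eq!(memchr(b'z', b"xyzzy"), Some(2));
    assert_eq!(memchr(b'q', b"xyzzy"), None);
}
```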
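The promotion of `Vector` to `src/vector.rs` is what lets the SSE2 and AVX2 bodies collapse into the generic `memchr<V: Vector>` shown at the top of the diff: `size()` and `align_mask()` get defaults derived from `size_of`, and the generic code only ever speaks in `splat`/`cmpeq`/`or`/`movemask` plus the two loads. The self-contained model below copies the shape of that trait (the two default methods are taken from the diff) but substitutes an 8-byte SWAR stand-in for the real `__m128i`/`__m256i` impls so it runs without any target features.

```rust
// A model of the Vector trait's shape. The scalar "vector" (8 bytes packed
// in a u64) is only a stand-in so the example runs anywhere.
trait Vector: Copy + core::fmt::Debug {
    fn size() -> usize {
        core::mem::size_of::<Self>()
    }
    fn align_mask() -> usize {
        Self::size() - 1
    }
    fn splat(byte: u8) -> Self;
    fn cmpeq(self, other: Self) -> Self;
    fn or(self, other: Self) -> Self;
    fn movemask(self) -> u32;
}

#[derive(Clone, Copy, Debug)]
struct Swar(u64);

impl Vector for Swar {
    fn splat(byte: u8) -> Swar {
        Swar(u64::from(byte) * 0x0101_0101_0101_0101)
    }
    fn cmpeq(self, other: Swar) -> Swar {
        // Set byte i to 0xFF where the corresponding bytes are equal.
        let mut out = 0u64;
        for i in 0..8 {
            let a = (self.0 >> (i * 8)) & 0xFF;
            let b = (other.0 >> (i * 8)) & 0xFF;
            if a == b {
                out |= 0xFF << (i * 8);
            }
        }
        Swar(out)
    }
    fn or(self, other: Swar) -> Swar {
        Swar(self.0 | other.0)
    }
    fn movemask(self) -> u32 {
        // Bit i set iff the high bit of byte i is set, like _mm_movemask_epi8.
        let mut mask = 0u32;
        for i in 0..8 {
            if (self.0 >> (i * 8 + 7)) & 1 == 1 {
                mask |= 1 << i;
            }
        }
        mask
    }
}

// A generic one-chunk search in the same style as the crate's forward_search1:
// compare a loaded chunk against a splatted needle, then movemask.
fn search_chunk<V: Vector>(needle: u8, chunk: V) -> Option<usize> {
    let mask = V::splat(needle).cmpeq(chunk).movemask();
    if mask != 0 {
        Some(mask.trailing_zeros() as usize)
    } else {
        None
    }
}

fn main() {
    assert_eq!(Swar::size(), 8);
    assert_eq!(Swar::align_mask(), 7);
    let chunk = Swar(u64::from_le_bytes(*b"memchr!!"));
    assert_eq!(search_chunk(b'c', chunk), Some(3));
    assert_eq!(search_chunk(b'z', chunk), None);
}
```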