From bf9ce475df82d362631099d491d3454d64d50217 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Mon, 29 Jul 2024 22:08:30 +0100 Subject: [PATCH] Improve `LIKE` performance for "contains" style queries (#6128) * improve "contains" performance * add tests * cargo fmt :disappointed: --------- Co-authored-by: Andrew Lamb --- arrow-string/Cargo.toml | 2 +- arrow-string/src/like.rs | 8 ++++++-- arrow-string/src/predicate.rs | 35 +++++++++++++++++++++++++++++------ 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index bdfa681113b4..0757067dc898 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -42,4 +42,4 @@ arrow-select = { workspace = true } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.8.0", default-features = false, features = ["unicode"] } num = { version = "0.4", default-features = false, features = ["std"] } -memchr = "2.7.1" +memchr = "2.7.4" diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 49831092ffcd..d23040b592da 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -239,7 +239,7 @@ fn op_scalar<'a, T: StringArrayType<'a>>( let r = match op { Op::Like(neg) => Predicate::like(r)?.evaluate_array(l, neg), Op::ILike(neg) => Predicate::ilike(r, l.is_ascii())?.evaluate_array(l, neg), - Op::Contains => Predicate::Contains(r).evaluate_array(l, false), + Op::Contains => Predicate::contains(r).evaluate_array(l, false), Op::StartsWith => Predicate::StartsWith(r).evaluate_array(l, false), Op::EndsWith => Predicate::EndsWith(r).evaluate_array(l, false), }; @@ -273,12 +273,16 @@ fn op_binary<'a>( match op { Op::Like(neg) => binary_predicate(l, r, neg, Predicate::like), Op::ILike(neg) => binary_predicate(l, r, neg, |s| Predicate::ilike(s, false)), - Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(l?.contains(r?))).collect()), + Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(str_contains(l?, r?))).collect()), Op::StartsWith => Ok(l.zip(r).map(|(l, r)| Some(l?.starts_with(r?))).collect()), Op::EndsWith => Ok(l.zip(r).map(|(l, r)| Some(l?.ends_with(r?))).collect()), } } +fn str_contains(haystack: &str, needle: &str) -> bool { + memchr::memmem::find(haystack.as_bytes(), needle.as_bytes()).is_some() +} + fn binary_predicate<'a>( l: impl Iterator>, r: impl Iterator>, diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index c7ccffb3ada4..0769da3fa0da 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -18,12 +18,13 @@ use arrow_array::{ArrayAccessor, BooleanArray}; use arrow_schema::ArrowError; use memchr::memchr2; +use memchr::memmem::Finder; use regex::{Regex, RegexBuilder}; /// A string based predicate pub enum Predicate<'a> { Eq(&'a str), - Contains(&'a str), + Contains(Finder<'a>), StartsWith(&'a str), EndsWith(&'a str), @@ -54,12 +55,16 @@ impl<'a> Predicate<'a> { && !pattern.ends_with("\\%") && !contains_like_pattern(&pattern[1..pattern.len() - 1]) { - Ok(Self::Contains(&pattern[1..pattern.len() - 1])) + Ok(Self::contains(&pattern[1..pattern.len() - 1])) } else { Ok(Self::Regex(regex_like(pattern, false)?)) } } + pub fn contains(needle: &'a str) -> Self { + Self::Contains(Finder::new(needle.as_bytes())) + } + /// Create a predicate for the given ilike pattern pub fn ilike(pattern: &'a str, is_ascii: bool) -> Result { if is_ascii && pattern.is_ascii() { @@ -82,7 +87,7 @@ impl<'a> Predicate<'a> { match self { Predicate::Eq(v) => *v == haystack, Predicate::IEqAscii(v) => haystack.eq_ignore_ascii_case(v), - Predicate::Contains(v) => haystack.contains(v), + Predicate::Contains(finder) => finder.find(haystack.as_bytes()).is_some(), Predicate::StartsWith(v) => haystack.starts_with(v), Predicate::IStartsWithAscii(v) => starts_with_ignore_ascii_case(haystack, v), Predicate::EndsWith(v) => haystack.ends_with(v), @@ -106,9 +111,9 @@ impl<'a> Predicate<'a> { Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { haystack.eq_ignore_ascii_case(v) != negate }), - Predicate::Contains(v) => { - BooleanArray::from_unary(array, |haystack| haystack.contains(v) != negate) - } + Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| { + finder.find(haystack.as_bytes()).is_some() != negate + }), Predicate::StartsWith(v) => { BooleanArray::from_unary(array, |haystack| haystack.starts_with(v) != negate) } @@ -258,4 +263,22 @@ mod tests { let r = regex_like(a_eq, false).unwrap(); assert_eq!(r.to_string(), expected); } + #[test] + fn test_contains() { + assert!(Predicate::contains("hay").evaluate("haystack")); + assert!(Predicate::contains("haystack").evaluate("haystack")); + assert!(Predicate::contains("h").evaluate("haystack")); + assert!(Predicate::contains("k").evaluate("haystack")); + assert!(Predicate::contains("stack").evaluate("haystack")); + assert!(Predicate::contains("sta").evaluate("haystack")); + assert!(Predicate::contains("stack").evaluate("hay£stack")); + assert!(Predicate::contains("y£s").evaluate("hay£stack")); + assert!(Predicate::contains("£").evaluate("hay£stack")); + assert!(Predicate::contains("a").evaluate("a")); + // not matching + assert!(!Predicate::contains("hy").evaluate("haystack")); + assert!(!Predicate::contains("stackx").evaluate("haystack")); + assert!(!Predicate::contains("x").evaluate("haystack")); + assert!(!Predicate::contains("haystack haystack").evaluate("haystack")); + } }