Skip to content

Commit

Permalink
Improve LIKE performance for "contains" style queries (#6128)
Browse files (browse the repository at this point in the history)
* improve "contains" performance

* add tests

* cargo fmt 😞

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
samuelcolvin and alamb authored Jul 29, 2024
1 parent 0e99e3a commit bf9ce47
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 9 deletions.
2 changes: 1 addition & 1 deletion arrow-string/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,4 @@ arrow-select = { workspace = true }
regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] }
regex-syntax = { version = "0.8.0", default-features = false, features = ["unicode"] }
num = { version = "0.4", default-features = false, features = ["std"] }
memchr = "2.7.1"
memchr = "2.7.4"
8 changes: 6 additions & 2 deletions arrow-string/src/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ fn op_scalar<'a, T: StringArrayType<'a>>(
let r = match op {
Op::Like(neg) => Predicate::like(r)?.evaluate_array(l, neg),
Op::ILike(neg) => Predicate::ilike(r, l.is_ascii())?.evaluate_array(l, neg),
Op::Contains => Predicate::Contains(r).evaluate_array(l, false),
Op::Contains => Predicate::contains(r).evaluate_array(l, false),
Op::StartsWith => Predicate::StartsWith(r).evaluate_array(l, false),
Op::EndsWith => Predicate::EndsWith(r).evaluate_array(l, false),
};
Expand Down Expand Up @@ -273,12 +273,16 @@ fn op_binary<'a>(
match op {
Op::Like(neg) => binary_predicate(l, r, neg, Predicate::like),
Op::ILike(neg) => binary_predicate(l, r, neg, |s| Predicate::ilike(s, false)),
Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(l?.contains(r?))).collect()),
Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(str_contains(l?, r?))).collect()),
Op::StartsWith => Ok(l.zip(r).map(|(l, r)| Some(l?.starts_with(r?))).collect()),
Op::EndsWith => Ok(l.zip(r).map(|(l, r)| Some(l?.ends_with(r?))).collect()),
}
}

/// Returns `true` when `needle` occurs anywhere inside `haystack`.
///
/// Both strings are compared as raw bytes via `memchr::memmem::find`.
fn str_contains(haystack: &str, needle: &str) -> bool {
    let (hay_bytes, needle_bytes) = (haystack.as_bytes(), needle.as_bytes());
    memchr::memmem::find(hay_bytes, needle_bytes).is_some()
}

fn binary_predicate<'a>(
l: impl Iterator<Item = Option<&'a str>>,
r: impl Iterator<Item = Option<&'a str>>,
Expand Down
35 changes: 29 additions & 6 deletions arrow-string/src/predicate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@
use arrow_array::{ArrayAccessor, BooleanArray};
use arrow_schema::ArrowError;
use memchr::memchr2;
use memchr::memmem::Finder;
use regex::{Regex, RegexBuilder};

/// A string based predicate
pub enum Predicate<'a> {
Eq(&'a str),
Contains(&'a str),
Contains(Finder<'a>),
StartsWith(&'a str),
EndsWith(&'a str),

Expand Down Expand Up @@ -54,12 +55,16 @@ impl<'a> Predicate<'a> {
&& !pattern.ends_with("\\%")
&& !contains_like_pattern(&pattern[1..pattern.len() - 1])
{
Ok(Self::Contains(&pattern[1..pattern.len() - 1]))
Ok(Self::contains(&pattern[1..pattern.len() - 1]))
} else {
Ok(Self::Regex(regex_like(pattern, false)?))
}
}

/// Create a predicate that matches when `needle` occurs anywhere in the haystack
///
/// The substring searcher is constructed once from the bytes of `needle` and
/// stored inside the returned [`Predicate::Contains`] variant.
pub fn contains(needle: &'a str) -> Self {
    let finder = Finder::new(needle.as_bytes());
    Self::Contains(finder)
}

/// Create a predicate for the given ilike pattern
pub fn ilike(pattern: &'a str, is_ascii: bool) -> Result<Self, ArrowError> {
if is_ascii && pattern.is_ascii() {
Expand All @@ -82,7 +87,7 @@ impl<'a> Predicate<'a> {
match self {
Predicate::Eq(v) => *v == haystack,
Predicate::IEqAscii(v) => haystack.eq_ignore_ascii_case(v),
Predicate::Contains(v) => haystack.contains(v),
Predicate::Contains(finder) => finder.find(haystack.as_bytes()).is_some(),
Predicate::StartsWith(v) => haystack.starts_with(v),
Predicate::IStartsWithAscii(v) => starts_with_ignore_ascii_case(haystack, v),
Predicate::EndsWith(v) => haystack.ends_with(v),
Expand All @@ -106,9 +111,9 @@ impl<'a> Predicate<'a> {
Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| {
haystack.eq_ignore_ascii_case(v) != negate
}),
Predicate::Contains(v) => {
BooleanArray::from_unary(array, |haystack| haystack.contains(v) != negate)
}
Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| {
finder.find(haystack.as_bytes()).is_some() != negate
}),
Predicate::StartsWith(v) => {
BooleanArray::from_unary(array, |haystack| haystack.starts_with(v) != negate)
}
Expand Down Expand Up @@ -258,4 +263,22 @@ mod tests {
let r = regex_like(a_eq, false).unwrap();
assert_eq!(r.to_string(), expected);
}
#[test]
fn test_contains() {
    // (needle, haystack) pairs where the needle is present, covering matches
    // at the start, middle and end, plus multi-byte UTF-8 content.
    let matching = [
        ("hay", "haystack"),
        ("haystack", "haystack"),
        ("h", "haystack"),
        ("k", "haystack"),
        ("stack", "haystack"),
        ("sta", "haystack"),
        ("stack", "hay£stack"),
        ("y£s", "hay£stack"),
        ("£", "hay£stack"),
        ("a", "a"),
    ];
    for (needle, haystack) in matching {
        assert!(Predicate::contains(needle).evaluate(haystack));
    }

    // not matching
    let not_matching = [
        ("hy", "haystack"),
        ("stackx", "haystack"),
        ("x", "haystack"),
        ("haystack haystack", "haystack"),
    ];
    for (needle, haystack) in not_matching {
        assert!(!Predicate::contains(needle).evaluate(haystack));
    }
}
}

0 comments on commit bf9ce47

Please sign in to comment.