From 12ca3f07d8e486275aa8871ca2c57a470460e11c Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Fri, 8 Nov 2024 17:59:01 +0100 Subject: [PATCH] Fix string view LIKE checks with NULL values (#6662) * Fix string view LIKE checks with NULL values * Add TODO comments to operations yet to be fixed * Use from_unary for contains with string view from_unary takes care of nulls * Fix performance * fixup! Fix performance --- arrow-string/src/like.rs | 119 +++++++++++++++++++++++++++++++++- arrow-string/src/predicate.rs | 34 +++++----- 2 files changed, 133 insertions(+), 20 deletions(-) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 5c76d5c810a7..fee3b792a946 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -1708,7 +1708,93 @@ mod tests { } #[test] - fn like_scalar_null() { + fn string_null_like_pattern() { + // Different patterns have different execution code paths + for pattern in &[ + "", // can execute as equality check + "_", // can execute as length check + "%", // can execute as starts_with("") or non-null check + "a%", // can execute as starts_with("a") + "%a", // can execute as ends_with("") + "a%b", // can execute as starts_with("a") && ends_with("b") + "%a%", // can_execute as contains("a") + "%a%b_c_d%e", // can_execute as regular expression + ] { + let a = Scalar::new(StringArray::new_null(1)); + let b = StringArray::new_scalar(pattern); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = Scalar::new(StringArray::new_null(1)); + let b = StringArray::from_iter_values([pattern]); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = StringArray::new_null(1); + let b = StringArray::from_iter_values([pattern]); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = StringArray::new_null(1); + let b = StringArray::new_scalar(pattern); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + } + } + + #[test] + fn string_view_null_like_pattern() { + // Different patterns have different execution code paths + for pattern in &[ + "", // can execute as equality check + "_", // can execute as length check + "%", // can execute as starts_with("") or non-null check + "a%", // can execute as starts_with("a") + "%a", // can execute as ends_with("") + "a%b", // can execute as starts_with("a") && ends_with("b") + "%a%", // can_execute as contains("a") + "%a%b_c_d%e", // can_execute as regular expression + ] { + let a = Scalar::new(StringViewArray::new_null(1)); + let b = StringViewArray::new_scalar(pattern); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = Scalar::new(StringViewArray::new_null(1)); + let b = StringViewArray::from_iter_values([pattern]); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = StringViewArray::new_null(1); + let b = StringViewArray::from_iter_values([pattern]); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = StringViewArray::new_null(1); + let b = StringViewArray::new_scalar(pattern); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + } + } + + #[test] + fn string_like_scalar_null() { let a = StringArray::new_scalar("a"); let b = Scalar::new(StringArray::new_null(1)); let r = like(&a, &b).unwrap(); @@ -1737,4 +1823,35 @@ mod tests { assert_eq!(r.null_count(), 1); assert!(r.is_null(0)); } + + #[test] + fn string_view_like_scalar_null() { + let a = StringViewArray::new_scalar("a"); + let b = Scalar::new(StringViewArray::new_null(1)); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = StringViewArray::from_iter_values(["a"]); + let b = Scalar::new(StringViewArray::new_null(1)); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = StringViewArray::from_iter_values(["a"]); + let b = StringViewArray::new_null(1); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = StringViewArray::new_scalar("a"); + let b = StringViewArray::new_null(1); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + } } diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 408d9d45cc75..1d141e6b0b21 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray}; +use arrow_array::{Array, ArrayAccessor, BooleanArray, StringViewArray}; +use arrow_buffer::BooleanBuffer; use arrow_schema::ArrowError; use memchr::memchr2; use memchr::memmem::Finder; @@ -114,30 +115,21 @@ impl<'a> Predicate<'a> { Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { haystack.eq_ignore_ascii_case(v) != negate }), - Predicate::Contains(finder) => { - if let Some(string_view_array) = array.as_any().downcast_ref::() { - BooleanArray::from( - string_view_array - .bytes_iter() - .map(|haystack| finder.find(haystack).is_some() != negate) - .collect::>(), - ) - } else { - BooleanArray::from_unary(array, |haystack| { - finder.find(haystack.as_bytes()).is_some() != negate - }) - } - } + Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| { + finder.find(haystack.as_bytes()).is_some() != negate + }), Predicate::StartsWith(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { - BooleanArray::from( + let nulls = string_view_array.logical_nulls(); + let values = BooleanBuffer::from( string_view_array .prefix_bytes_iter(v.len()) .map(|haystack| { equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate }) .collect::>(), - ) + ); + BooleanArray::new(values, nulls) } else { BooleanArray::from_unary(array, |haystack| { starts_with(haystack, v, equals_kernel) != negate @@ -146,6 +138,7 @@ impl<'a> Predicate<'a> { } Predicate::IStartsWithAscii(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { + // TODO respect null buffer BooleanArray::from( string_view_array .prefix_bytes_iter(v.len()) @@ -166,14 +159,16 @@ impl<'a> Predicate<'a> { } Predicate::EndsWith(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { - BooleanArray::from( + let nulls = string_view_array.logical_nulls(); + let values = BooleanBuffer::from( string_view_array .suffix_bytes_iter(v.len()) .map(|haystack| { equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate }) .collect::>(), - ) + ); + BooleanArray::new(values, nulls) } else { BooleanArray::from_unary(array, |haystack| { ends_with(haystack, v, equals_kernel) != negate @@ -182,6 +177,7 @@ impl<'a> Predicate<'a> { } Predicate::IEndsWithAscii(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { + // TODO respect null buffer BooleanArray::from( string_view_array .suffix_bytes_iter(v.len())