From 6eb924676f453b531e85fb4589cd372e9014539b Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Wed, 22 Dec 2021 09:51:04 +0100 Subject: [PATCH 1/4] Fix like regex escaping --- arrow/src/compute/kernels/comparison.rs | 49 +++++++++++++++++++++---- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index ea48007211f5..de3786db2713 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -32,7 +32,7 @@ use crate::datatypes::{ }; use crate::error::{ArrowError, Result}; use crate::util::bit_util; -use regex::Regex; +use regex::{escape, Regex}; use std::any::type_name; use std::collections::HashMap; @@ -259,14 +259,14 @@ where let mut result = BooleanBufferBuilder::new(left.len()); for i in 0..left.len() { let haystack = left.value(i); - let pat = right.value(i); - let re = if let Some(ref regex) = map.get(pat) { + let pat = escape(right.value(i)); + let re = if let Some(ref regex) = map.get(&pat) { regex } else { let re_pattern = pat.replace("%", ".*").replace("_", "."); let re = op(&re_pattern)?; - map.insert(pat, re); - map.get(pat).unwrap() + map.insert(pat.clone(), re); + map.get(&pat).unwrap() }; result.append(if negate_regex { @@ -2235,10 +2235,10 @@ mod tests { test_utf8!( test_utf8_array_like, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"], + vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"], like_utf8, - vec![true, true, true, false, false, true, false] + vec![true, true, true, false, false, true, false, false] ); test_utf8_scalar!( @@ -2248,6 +2248,23 @@ mod tests { like_utf8_scalar, vec![true, true, false, false] ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_escape_regex, + vec![".*", "a", "*"], + ".*", + like_utf8_scalar, + vec![true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_escape_regex_dot, + vec![".", "a", "*"], + ".", + like_utf8_scalar, + vec![true, false, false] + ); + test_utf8_scalar!( test_utf8_array_like_scalar, vec!["arrow", "parquet", "datafusion", "flight"], @@ -2316,6 +2333,22 @@ mod tests { nlike_utf8_scalar, vec![false, false, true, true] ); + + test_utf8_scalar!( + test_utf8_array_nlike_scalar_escape_regex, + vec![".*", "a", "*"], + ".*", + like_utf8_scalar, + vec![false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_scalar_escape_regex_dot, + vec![".", "a", "*"], + ".", + like_utf8_scalar, + vec![false, true, true] + ); test_utf8_scalar!( test_utf8_array_nlike_scalar, vec!["arrow", "parquet", "datafusion", "flight"], From 2e7297c30c09e07e6e43be774a360ba760bd8950 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Wed, 22 Dec 2021 11:51:10 +0100 Subject: [PATCH 2/4] Fix like regex escaping --- arrow/src/compute/kernels/comparison.rs | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index de3786db2713..b0e605964036 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -360,11 +360,7 @@ pub fn like_utf8_scalar( } } } else { - let re_pattern = right - .replace("%", ".*") - .replace("_", ".") - .replace("(", r#"\("#) - .replace(")", r#"\)"#); + let re_pattern = escape(right).replace("%", ".*").replace("_", "."); let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from LIKE pattern: {}", @@ -440,11 +436,7 @@ pub fn nlike_utf8_scalar( result.append(!left.value(i).ends_with(&right[1..])); } } else { - let re_pattern = right - .replace("%", ".*") - .replace("_", ".") - .replace("(", r#"\("#) - .replace(")", r#"\)"#); + let re_pattern = escape(right).replace("%", ".*").replace("_", "."); let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from LIKE pattern: {}", @@ -525,11 +517,7 @@ pub fn ilike_utf8_scalar( ); } } else { - let re_pattern = right - .replace("%", ".*") - .replace("_", ".") - .replace("(", r#"\("#) - .replace(")", r#"\)"#); + let re_pattern = escape(right).replace("%", ".*").replace("_", "."); let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from ILIKE pattern: {}", @@ -2338,7 +2326,7 @@ mod tests { test_utf8_array_nlike_scalar_escape_regex, vec![".*", "a", "*"], ".*", - like_utf8_scalar, + nlike_utf8_scalar, vec![false, true, true] ); @@ -2346,7 +2334,7 @@ mod tests { test_utf8_array_nlike_scalar_escape_regex_dot, vec![".", "a", "*"], ".", - like_utf8_scalar, + nlike_utf8_scalar, vec![false, true, true] ); test_utf8_scalar!( From 976c03aa1bdcda75140c26ed8f35ccb2a6c26550 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Wed, 22 Dec 2021 15:19:03 +0100 Subject: [PATCH 3/4] Fix doctest --- arrow/src/compute/kernels/comparison.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index b0e605964036..8d9f1c340ab1 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -303,7 +303,7 @@ where /// use arrow::compute::like_utf8; /// /// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); -/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A."]); +/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]); /// /// let result = like_utf8(&strings, &patterns).unwrap(); /// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); From 595c07b4cd882b1c33f0790e2ea4de6547faa151 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Wed, 22 Dec 2021 16:28:03 +0100 Subject: [PATCH 4/4] Simplify --- arrow/src/compute/kernels/comparison.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 8d9f1c340ab1..a132963c8dbe 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -259,14 +259,14 @@ where let mut result = BooleanBufferBuilder::new(left.len()); for i in 0..left.len() { let haystack = left.value(i); - let pat = escape(right.value(i)); - let re = if let Some(ref regex) = map.get(&pat) { + let pat = right.value(i); + let re = if let Some(ref regex) = map.get(pat) { regex } else { - let re_pattern = pat.replace("%", ".*").replace("_", "."); + let re_pattern = escape(pat).replace("%", ".*").replace("_", "."); let re = op(&re_pattern)?; - map.insert(pat.clone(), re); - map.get(&pat).unwrap() + map.insert(pat, re); + map.get(pat).unwrap() }; result.append(if negate_regex {