From d900a35e38f2c159427fa1efb80143733b899cd2 Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Wed, 6 Dec 2023 12:35:56 +0100 Subject: [PATCH 1/4] refactor trim --- .../physical-expr/src/string_expressions.rs | 149 ++++++------------ 1 file changed, 50 insertions(+), 99 deletions(-) diff --git a/datafusion/physical-expr/src/string_expressions.rs b/datafusion/physical-expr/src/string_expressions.rs index 91d21f95e41f..08691bb25e88 100644 --- a/datafusion/physical-expr/src/string_expressions.rs +++ b/datafusion/physical-expr/src/string_expressions.rs @@ -133,53 +133,6 @@ pub fn ascii(args: &[ArrayRef]) -> Result { Ok(Arc::new(result) as ArrayRef) } -/// Removes the longest string containing only characters in characters (a space by default) from the start and end of string. -/// btrim('xyxtrimyyx', 'xyz') = 'trim' -pub fn btrim(args: &[ArrayRef]) -> Result { - match args.len() { - 1 => { - let string_array = as_generic_string_array::(&args[0])?; - - let result = string_array - .iter() - .map(|string| { - string.map(|string: &str| { - string.trim_start_matches(' ').trim_end_matches(' ') - }) - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) - } - 2 => { - let string_array = as_generic_string_array::(&args[0])?; - let characters_array = as_generic_string_array::(&args[1])?; - - let result = string_array - .iter() - .zip(characters_array.iter()) - .map(|(string, characters)| match (string, characters) { - (None, _) => None, - (_, None) => None, - (Some(string), Some(characters)) => { - let chars: Vec = characters.chars().collect(); - Some( - string - .trim_start_matches(&chars[..]) - .trim_end_matches(&chars[..]), - ) - } - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) - } - other => internal_err!( - "btrim was called with {other} arguments. It requires at least 1 and at most 2." - ), - } -} - /// Returns the character with the given code. chr(0) is disallowed because text data types cannot store that character. /// chr(65) = 'A' pub fn chr(args: &[ArrayRef]) -> Result { @@ -346,44 +299,80 @@ pub fn lower(args: &[ColumnarValue]) -> Result { handle(args, |string| string.to_ascii_lowercase(), "lower") } -/// Removes the longest string containing only characters in characters (a space by default) from the start of string. -/// ltrim('zzzytest', 'xyz') = 'test' -pub fn ltrim(args: &[ArrayRef]) -> Result { +enum TrimType { + Left, + Right, + Both, +} + +fn general_trim( + args: &[ArrayRef], + trim_type: TrimType, +) -> Result { + let func = match trim_type { + TrimType::Left => str::trim_start_matches::<&str>, + TrimType::Right => str::trim_end_matches::<&str>, + TrimType::Both => |input, pattern| { + str::trim_end_matches(str::trim_start_matches(input, pattern), pattern) + }, + }; + + let string_array = as_generic_string_array::(&args[0])?; + match args.len() { 1 => { - let string_array = as_generic_string_array::(&args[0])?; - let result = string_array .iter() - .map(|string| string.map(|string: &str| string.trim_start_matches(' '))) + .map(|string| string.map(|string: &str| func(string, " "))) .collect::>(); Ok(Arc::new(result) as ArrayRef) } 2 => { - let string_array = as_generic_string_array::(&args[0])?; let characters_array = as_generic_string_array::(&args[1])?; let result = string_array .iter() .zip(characters_array.iter()) .map(|(string, characters)| match (string, characters) { - (Some(string), Some(characters)) => { - let chars: Vec = characters.chars().collect(); - Some(string.trim_start_matches(&chars[..])) - } + (Some(string), Some(characters)) => Some(func(string, characters)), _ => None, }) .collect::>(); Ok(Arc::new(result) as ArrayRef) } - other => internal_err!( - "ltrim was called with {other} arguments. It requires at least 1 and at most 2." - ), + other => { + let trim = match trim_type { + TrimType::Left => "ltrim", + TrimType::Right => "rtrim", + TrimType::Both => "btrim", + }; + internal_err!( + "{trim} was called with {other} arguments. It requires at least 1 and at most 2." + ) + } } } +/// Removes the longest string containing only characters in characters (a space by default) from the start and end of string. +/// btrim('xyxtrimyyx', 'xyz') = 'trim' +pub fn btrim(args: &[ArrayRef]) -> Result { + general_trim::(args, TrimType::Both) +} + +/// Removes the longest string containing only characters in characters (a space by default) from the start of string. +/// ltrim('zzzytest', 'xyz') = 'test' +pub fn ltrim(args: &[ArrayRef]) -> Result { + general_trim::(args, TrimType::Left) +} + +/// Removes the longest string containing only characters in characters (a space by default) from the end of string. +/// rtrim('testxxzx', 'xyz') = 'test' +pub fn rtrim(args: &[ArrayRef]) -> Result { + general_trim::(args, TrimType::Right) +} + /// Repeats string the specified number of times. /// repeat('Pg', 4) = 'PgPgPgPg' pub fn repeat(args: &[ArrayRef]) -> Result { @@ -422,44 +411,6 @@ pub fn replace(args: &[ArrayRef]) -> Result { Ok(Arc::new(result) as ArrayRef) } -/// Removes the longest string containing only characters in characters (a space by default) from the end of string. -/// rtrim('testxxzx', 'xyz') = 'test' -pub fn rtrim(args: &[ArrayRef]) -> Result { - match args.len() { - 1 => { - let string_array = as_generic_string_array::(&args[0])?; - - let result = string_array - .iter() - .map(|string| string.map(|string: &str| string.trim_end_matches(' '))) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) - } - 2 => { - let string_array = as_generic_string_array::(&args[0])?; - let characters_array = as_generic_string_array::(&args[1])?; - - let result = string_array - .iter() - .zip(characters_array.iter()) - .map(|(string, characters)| match (string, characters) { - (Some(string), Some(characters)) => { - let chars: Vec = characters.chars().collect(); - Some(string.trim_end_matches(&chars[..])) - } - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) - } - other => internal_err!( - "rtrim was called with {other} arguments. It requires at least 1 and at most 2." - ), - } -} - /// Splits string at occurrences of delimiter and returns the n'th field (counting from one). /// split_part('abc~@~def~@~ghi', '~@~', 2) = 'def' pub fn split_part(args: &[ArrayRef]) -> Result { From 0a8a3f8d4e881d012fba472fd7245711906efc5a Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Wed, 6 Dec 2023 12:42:52 +0100 Subject: [PATCH 2/4] add fmt for TrimType --- .../physical-expr/src/string_expressions.rs | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/datafusion/physical-expr/src/string_expressions.rs b/datafusion/physical-expr/src/string_expressions.rs index 08691bb25e88..9a238149411e 100644 --- a/datafusion/physical-expr/src/string_expressions.rs +++ b/datafusion/physical-expr/src/string_expressions.rs @@ -37,8 +37,11 @@ use datafusion_common::{ }; use datafusion_common::{internal_err, DataFusionError, Result}; use datafusion_expr::ColumnarValue; -use std::iter; use std::sync::Arc; +use std::{ + fmt::{Display, Formatter}, + iter, +}; use uuid::Uuid; /// applies a unary expression to `args[0]` that is expected to be downcastable to @@ -305,6 +308,16 @@ enum TrimType { Both, } +impl Display for TrimType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + TrimType::Left => write!(f, "ltrim"), + TrimType::Right => write!(f, "rtrim"), + TrimType::Both => write!(f, "btrim"), + } + } +} + fn general_trim( args: &[ArrayRef], trim_type: TrimType, @@ -343,13 +356,8 @@ fn general_trim( Ok(Arc::new(result) as ArrayRef) } other => { - let trim = match trim_type { - TrimType::Left => "ltrim", - TrimType::Right => "rtrim", - TrimType::Both => "btrim", - }; internal_err!( - "{trim} was called with {other} arguments. It requires at least 1 and at most 2." + "{trim_type} was called with {other} arguments. It requires at least 1 and at most 2." ) } } From ec71f3bfd3965e6ec694c6d5c090b301c2f3acd3 Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Thu, 7 Dec 2023 11:31:21 +0100 Subject: [PATCH 3/4] fix closure --- .../physical-expr/src/string_expressions.rs | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-expr/src/string_expressions.rs b/datafusion/physical-expr/src/string_expressions.rs index 9a238149411e..406fa120e86c 100644 --- a/datafusion/physical-expr/src/string_expressions.rs +++ b/datafusion/physical-expr/src/string_expressions.rs @@ -323,10 +323,20 @@ fn general_trim( trim_type: TrimType, ) -> Result { let func = match trim_type { - TrimType::Left => str::trim_start_matches::<&str>, - TrimType::Right => str::trim_end_matches::<&str>, - TrimType::Both => |input, pattern| { - str::trim_end_matches(str::trim_start_matches(input, pattern), pattern) + TrimType::Left => |input, pattern: &str| { + let pattern = pattern.chars().collect::>(); + str::trim_start_matches::<&[char]>(input, pattern.as_ref()) + }, + TrimType::Right => |input, pattern: &str| { + let pattern = pattern.chars().collect::>(); + str::trim_end_matches::<&[char]>(input, pattern.as_ref()) + }, + TrimType::Both => |input, pattern: &str| { + let pattern = pattern.chars().collect::>(); + str::trim_end_matches::<&[char]>( + str::trim_start_matches::<&[char]>(input, pattern.as_ref()), + pattern.as_ref(), + ) }, }; From ec7d7accbe7fb6f5295b0390a7e089ea411eb7d4 Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Fri, 8 Dec 2023 21:49:52 +0100 Subject: [PATCH 4/4] update comment --- datafusion/physical-expr/src/string_expressions.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-expr/src/string_expressions.rs b/datafusion/physical-expr/src/string_expressions.rs index 406fa120e86c..7d9fecf61407 100644 --- a/datafusion/physical-expr/src/string_expressions.rs +++ b/datafusion/physical-expr/src/string_expressions.rs @@ -373,19 +373,19 @@ fn general_trim( } } -/// Removes the longest string containing only characters in characters (a space by default) from the start and end of string. +/// Returns the longest string with leading and trailing characters removed. If the characters are not specified, whitespace is removed. /// btrim('xyxtrimyyx', 'xyz') = 'trim' pub fn btrim(args: &[ArrayRef]) -> Result { general_trim::(args, TrimType::Both) } -/// Removes the longest string containing only characters in characters (a space by default) from the start of string. +/// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed. /// ltrim('zzzytest', 'xyz') = 'test' pub fn ltrim(args: &[ArrayRef]) -> Result { general_trim::(args, TrimType::Left) } -/// Removes the longest string containing only characters in characters (a space by default) from the end of string. +/// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed. /// rtrim('testxxzx', 'xyz') = 'test' pub fn rtrim(args: &[ArrayRef]) -> Result { general_trim::(args, TrimType::Right)