Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions #9841

Merged
merged 17 commits into from
Mar 29, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 1 addition & 49 deletions datafusion/expr/src/built_in_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,18 +111,8 @@ pub enum BuiltinScalarFunction {
EndsWith,
/// initcap
InitCap,
/// left
Left,
/// lpad
Lpad,
/// random
Random,
/// reverse
Reverse,
/// right
Right,
/// rpad
Rpad,
/// strpos
Strpos,
/// substr
Expand Down Expand Up @@ -220,12 +210,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::ConcatWithSeparator => Volatility::Immutable,
BuiltinScalarFunction::EndsWith => Volatility::Immutable,
BuiltinScalarFunction::InitCap => Volatility::Immutable,
BuiltinScalarFunction::Left => Volatility::Immutable,
BuiltinScalarFunction::Lpad => Volatility::Immutable,
BuiltinScalarFunction::Radians => Volatility::Immutable,
BuiltinScalarFunction::Reverse => Volatility::Immutable,
BuiltinScalarFunction::Right => Volatility::Immutable,
BuiltinScalarFunction::Rpad => Volatility::Immutable,
BuiltinScalarFunction::Strpos => Volatility::Immutable,
BuiltinScalarFunction::Substr => Volatility::Immutable,
BuiltinScalarFunction::Translate => Volatility::Immutable,
Expand Down Expand Up @@ -264,17 +249,8 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::InitCap => {
utf8_to_str_type(&input_expr_types[0], "initcap")
}
BuiltinScalarFunction::Left => utf8_to_str_type(&input_expr_types[0], "left"),
BuiltinScalarFunction::Lpad => utf8_to_str_type(&input_expr_types[0], "lpad"),
BuiltinScalarFunction::Pi => Ok(Float64),
BuiltinScalarFunction::Random => Ok(Float64),
BuiltinScalarFunction::Reverse => {
utf8_to_str_type(&input_expr_types[0], "reverse")
}
BuiltinScalarFunction::Right => {
utf8_to_str_type(&input_expr_types[0], "right")
}
BuiltinScalarFunction::Rpad => utf8_to_str_type(&input_expr_types[0], "rpad"),
BuiltinScalarFunction::EndsWith => Ok(Boolean),
BuiltinScalarFunction::Strpos => {
utf8_to_int_type(&input_expr_types[0], "strpos/instr/position")
Expand Down Expand Up @@ -361,28 +337,9 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Coalesce => {
Signature::variadic_equal(self.volatility())
}
BuiltinScalarFunction::InitCap | BuiltinScalarFunction::Reverse => {
BuiltinScalarFunction::InitCap => {
Signature::uniform(1, vec![Utf8, LargeUtf8], self.volatility())
}
BuiltinScalarFunction::Lpad | BuiltinScalarFunction::Rpad => {
Signature::one_of(
vec![
Exact(vec![Utf8, Int64]),
Exact(vec![LargeUtf8, Int64]),
Exact(vec![Utf8, Int64, Utf8]),
Exact(vec![LargeUtf8, Int64, Utf8]),
Exact(vec![Utf8, Int64, LargeUtf8]),
Exact(vec![LargeUtf8, Int64, LargeUtf8]),
],
self.volatility(),
)
}
BuiltinScalarFunction::Left | BuiltinScalarFunction::Right => {
Signature::one_of(
vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])],
self.volatility(),
)
}

BuiltinScalarFunction::EndsWith | BuiltinScalarFunction::Strpos => {
Signature::one_of(
Expand Down Expand Up @@ -580,11 +537,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"],
BuiltinScalarFunction::EndsWith => &["ends_with"],
BuiltinScalarFunction::InitCap => &["initcap"],
BuiltinScalarFunction::Left => &["left"],
BuiltinScalarFunction::Lpad => &["lpad"],
BuiltinScalarFunction::Reverse => &["reverse"],
BuiltinScalarFunction::Right => &["right"],
BuiltinScalarFunction::Rpad => &["rpad"],
BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"],
BuiltinScalarFunction::Substr => &["substr"],
BuiltinScalarFunction::Translate => &["translate"],
Expand Down
21 changes: 0 additions & 21 deletions datafusion/expr/src/expr_fn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -578,25 +578,11 @@ scalar_expr!(Atan2, atan2, y x, "inverse tangent of a division given in the argu
scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`");

scalar_expr!(InitCap, initcap, string, "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase");
scalar_expr!(Left, left, string n, "returns the first `n` characters in the `string`");
scalar_expr!(Reverse, reverse, string, "reverses the `string`");
scalar_expr!(Right, right, string n, "returns the last `n` characters in the `string`");
scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends with the `suffix`");
scalar_expr!(Strpos, strpos, string substring, "finds the position from where the `substring` matches the `string`");
scalar_expr!(Substr, substr, string position, "substring from the `position` to the end");
scalar_expr!(Substr, substring, string position length, "substring from the `position` with `length` characters");
scalar_expr!(Translate, translate, string from to, "replaces the characters in `from` with the counterpart in `to`");
//use vec as parameter
nary_scalar_expr!(
Lpad,
lpad,
"fill up a string to the length by prepending the characters"
);
nary_scalar_expr!(
Rpad,
rpad,
"fill up a string to the length by appending the characters"
);
nary_scalar_expr!(Coalesce, coalesce, "returns `coalesce(args...)`, which evaluates to the value of the first [Expr] which is not NULL");
//there is a func concat_ws before, so use concat_ws_expr as name.c
nary_scalar_expr!(
Expand Down Expand Up @@ -1028,13 +1014,6 @@ mod test {
test_scalar_expr!(Gcd, gcd, arg_1, arg_2);
test_scalar_expr!(Lcm, lcm, arg_1, arg_2);
test_scalar_expr!(InitCap, initcap, string);
test_scalar_expr!(Left, left, string, count);
test_nary_scalar_expr!(Lpad, lpad, string, count);
test_nary_scalar_expr!(Lpad, lpad, string, count, characters);
test_scalar_expr!(Reverse, reverse, string);
test_scalar_expr!(Right, right, string, count);
test_nary_scalar_expr!(Rpad, rpad, string, count);
test_nary_scalar_expr!(Rpad, rpad, string, count, characters);
test_scalar_expr!(EndsWith, ends_with, string, characters);
test_scalar_expr!(Strpos, strpos, string, substring);
test_scalar_expr!(Substr, substr, string, position);
Expand Down
245 changes: 245 additions & 0 deletions datafusion/functions/src/unicode/left.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::any::Any;
use std::cmp::Ordering;
use std::sync::Arc;

use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
use arrow::datatypes::DataType;

use datafusion_common::cast::{as_generic_string_array, as_int64_array};
use datafusion_common::exec_err;
use datafusion_common::Result;
use datafusion_expr::TypeSignature::Exact;
use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};

use crate::utils::{make_scalar_function, utf8_to_str_type};

/// Scalar UDF implementation backing the SQL `left` function.
///
/// The only state is the [`Signature`] built once in `LeftFunc::new` and
/// handed out by `ScalarUDFImpl::signature`.
#[derive(Debug)]
pub(super) struct LeftFunc {
/// Accepted argument type combinations and volatility for `left`.
signature: Signature,
}

impl LeftFunc {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::one_of(
vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])],
Volatility::Immutable,
),
}
}
}

impl ScalarUDFImpl for LeftFunc {
    /// Exposes `self` as `Any` so callers can downcast to `LeftFunc`.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Name under which the function is registered: `"left"`.
    fn name(&self) -> &str {
        "left"
    }

    /// The signature built in `LeftFunc::new`.
    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Derives the return type from the type of the first (string) argument
    /// by delegating to `utf8_to_str_type`.
    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
        let string_type = &arg_types[0];
        utf8_to_str_type(string_type, "left")
    }

    /// Dispatches to `left::<i32>` for `Utf8` input and `left::<i64>` for
    /// `LargeUtf8` input; any other first-argument type is an execution error.
    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
        let string_type = args[0].data_type();

        if string_type == DataType::Utf8 {
            return make_scalar_function(left::<i32>, vec![])(args);
        }
        if string_type == DataType::LargeUtf8 {
            return make_scalar_function(left::<i64>, vec![])(args);
        }

        let other = string_type;
        exec_err!("Unsupported data type {other:?} for function left")
    }
}

/// Returns first n characters in the string, or when n is negative, returns all but last |n| characters.
/// left('abcde', 2) = 'ab'
/// left('abcde', -2) = 'abc'
/// The implementation uses UTF-8 code points as characters
///
/// `args[0]` is the string array (with offset width `T`) and `args[1]` is the
/// `Int64` array holding `n`. An output row is null whenever either input row
/// is null.
///
/// # Errors
/// Returns an error if `args[0]` cannot be downcast to a string array with
/// offset type `T`, or if `args[1]` cannot be downcast to an `Int64` array.
pub fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
    let string_array = as_generic_string_array::<T>(&args[0])?;
    let n_array = as_int64_array(&args[1])?;
    let result = string_array
        .iter()
        .zip(n_array.iter())
        .map(|(string, n)| match (string, n) {
            // Borrow the prefix straight out of the input buffer; no per-row `String`.
            (Some(string), Some(n)) => Some(left_prefix(string, n)),
            _ => None,
        })
        .collect::<GenericStringArray<T>>();

    Ok(Arc::new(result) as ArrayRef)
}

/// Returns the prefix of `string` selected by `left(string, n)` as a borrowed slice.
///
/// Works in the unsigned domain so that `n == i64::MIN` (whose absolute value
/// does not fit in `i64`) yields the empty string instead of overflowing.
fn left_prefix(string: &str, n: i64) -> &str {
    // Number of leading characters to keep.
    let keep: u64 = match n.cmp(&0) {
        Ordering::Equal => return "",
        Ordering::Greater => n.unsigned_abs(),
        Ordering::Less => {
            (string.chars().count() as u64).saturating_sub(n.unsigned_abs())
        }
    };

    // A string never has more characters than bytes, so asking for at least
    // `len()` characters means "the whole string".
    if keep >= string.len() as u64 {
        return string;
    }

    // Here `keep < string.len()`, so the cast to `usize` cannot truncate.
    match string.char_indices().nth(keep as usize) {
        // `end` is the byte offset of the first character that is dropped.
        Some((end, _)) => &string[..end],
        None => string,
    }
}

#[cfg(test)]
mod tests {
use arrow::array::{Array, StringArray};
use arrow::datatypes::DataType::Utf8;

use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

use crate::unicode::left::LeftFunc;
use crate::utils::test::test_function;

#[test]
fn test_functions() -> Result<()> {
#[cfg(feature = "unicode_expressions")]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the whole module is cfg'd we can probably remove these guards on individual tests

test_function!(
LeftFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("abcde")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Totally not needed as this code is just moved, but I think you can write this more concisely with `from` if you want:

Suggested change
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("abcde")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
ColumnarValue::Scalar(ScalarValue::from("abcde")),
ColumnarValue::Scalar(ScalarValue::from(2u64)),

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(same thing applies to the rest of the tests in this file and the others)

],
Ok(Some("ab")),
&str,
Utf8,
StringArray
);
// A count larger than the string length returns the whole string.
#[cfg(feature = "unicode_expressions")]
test_function!(
LeftFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("abcde")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(200))),
],
Ok(Some("abcde")),
&str,
Utf8,
StringArray
);
// A negative count drops that many characters from the end.
#[cfg(feature = "unicode_expressions")]
test_function!(
LeftFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("abcde")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(-2))),
],
Ok(Some("abc")),
&str,
Utf8,
StringArray
);
// A negative count whose magnitude exceeds the length yields the empty string.
#[cfg(feature = "unicode_expressions")]
test_function!(
LeftFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("abcde")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(-200))),
],
Ok(Some("")),
&str,
Utf8,
StringArray
);
// A zero count yields the empty string.
#[cfg(feature = "unicode_expressions")]
test_function!(
LeftFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("abcde")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(0))),
],
Ok(Some("")),
&str,
Utf8,
StringArray
);
// A null string yields null.
#[cfg(feature = "unicode_expressions")]
test_function!(
LeftFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8(None)),
ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
],
Ok(None),
&str,
Utf8,
StringArray
);
// A null count yields null.
#[cfg(feature = "unicode_expressions")]
test_function!(
LeftFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("abcde")))),
ColumnarValue::Scalar(ScalarValue::Int64(None)),
],
Ok(None),
&str,
Utf8,
StringArray
);
// Multi-byte characters: the count is in characters, not bytes.
#[cfg(feature = "unicode_expressions")]
test_function!(
LeftFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("joséésoj")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(5))),
],
Ok(Some("joséé")),
&str,
Utf8,
StringArray
);
// Multi-byte characters with a negative count: drops the last three characters.
#[cfg(feature = "unicode_expressions")]
test_function!(
LeftFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("joséésoj")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(-3))),
],
Ok(Some("joséé")),
&str,
Utf8,
StringArray
);
// Without the `unicode_expressions` feature, `left` is expected to fail with
// an internal error naming the missing feature flag.
#[cfg(not(feature = "unicode_expressions"))]
test_function!(
LeftFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("abcde")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
],
internal_err!(
"function left requires compilation with feature flag: unicode_expressions."
),
&str,
Utf8,
StringArray
);

Ok(())
}
}
Loading
Loading