Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added upper
Browse files Browse the repository at this point in the history
Signed-off-by: Xuanwo <[email protected]>
  • Loading branch information
Xuanwo committed Dec 7, 2021
1 parent 998882e commit fd37721
Show file tree
Hide file tree
Showing 5 changed files with 260 additions and 0 deletions.
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ compute_take = []
compute_temporal = []
compute_window = ["compute_concatenate"]
compute_lower = []
compute_upper = []
compute = [
"compute_aggregate",
"compute_arithmetics",
Expand All @@ -198,6 +199,7 @@ compute = [
"compute_temporal",
"compute_window",
"compute_lower",
"compute_upper"
]
# base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format.
io_parquet = ["parquet2", "io_ipc", "base64", "futures"]
Expand Down
3 changes: 3 additions & 0 deletions src/compute/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ pub mod take;
#[cfg(feature = "compute_temporal")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_temporal")))]
pub mod temporal;
#[cfg(feature = "compute_upper")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_upper")))]
pub mod upper;
mod utils;
#[cfg(feature = "compute_window")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_window")))]
Expand Down
67 changes: 67 additions & 0 deletions src/compute/upper.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Defines a kernel that converts the values of a \[Large\]StringArray to upper case
use super::utils::utf8_apply;
use crate::array::*;
use crate::{
datatypes::DataType,
error::{ArrowError, Result},
};

/// Upper-cases every string of `array` and returns the result as a new boxed array.
///
/// Both `Utf8` (`i32` offsets) and `LargeUtf8` (`i64` offsets) are handled; each
/// value is mapped through [`str::to_uppercase`] via `utf8_apply`.
///
/// # Errors
/// Returns [`ArrowError::InvalidArgumentError`] when the array's data type is
/// neither `Utf8` nor `LargeUtf8`.
///
/// # Panics
/// Panics if the reported data type is a string type but the concrete array
/// cannot be downcast to the matching `Utf8Array`.
pub fn upper(array: &dyn Array) -> Result<Box<dyn Array>> {
    let any = array.as_any();
    match array.data_type() {
        DataType::LargeUtf8 => {
            let strings = any
                .downcast_ref::<Utf8Array<i64>>()
                .expect("A large string is expected");
            Ok(Box::new(utf8_apply(str::to_uppercase, strings)))
        }
        DataType::Utf8 => {
            let strings = any
                .downcast_ref::<Utf8Array<i32>>()
                .expect("A string is expected");
            Ok(Box::new(utf8_apply(str::to_uppercase, strings)))
        }
        unsupported => Err(ArrowError::InvalidArgumentError(format!(
            "upper does not support type {:?}",
            unsupported
        ))),
    }
}

/// Reports whether [`upper`] supports arrays of the given `data_type`.
///
/// Only the two UTF-8 string types, `Utf8` and `LargeUtf8`, are accepted.
///
/// # Examples
/// ```
/// use arrow2::compute::upper::can_upper;
/// use arrow2::datatypes::DataType;
///
/// assert!(can_upper(&DataType::Utf8));
/// assert!(can_upper(&DataType::LargeUtf8));
/// assert!(!can_upper(&DataType::Null));
/// ```
pub fn can_upper(data_type: &DataType) -> bool {
    matches!(data_type, DataType::Utf8 | DataType::LargeUtf8)
}
2 changes: 2 additions & 0 deletions tests/it/compute/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,7 @@ mod substring;
mod take;
#[cfg(feature = "compute_temporal")]
mod temporal;
#[cfg(feature = "compute_upper")]
mod upper;
#[cfg(feature = "compute_window")]
mod window;
186 changes: 186 additions & 0 deletions tests/it/compute/upper.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
use arrow2::{array::*, compute::upper::*, error::Result};

/// Runs `upper` over nullable string arrays with offset type `O` and checks
/// every result against the expected upper-cased array.
///
/// Each case is an `(input, expected)` pair; `None` entries are expected to
/// stay `None` at the same position.
fn with_nulls_utf8<O: Offset>() -> Result<()> {
    let cases = vec![
        // all-lowercase input: every letter changes
        (
            vec![Some("hello"), None, Some("world")],
            vec![Some("HELLO"), None, Some("WORLD")],
        ),
        // mixed-case input: only part of the letters change
        (
            vec![Some("Hello"), None, Some("wOrld")],
            vec![Some("HELLO"), None, Some("WORLD")],
        ),
        // identity: already upper-cased input must come back unchanged
        // (previously this case duplicated the all-lowercase one)
        (
            vec![Some("HELLO"), None, Some("WORLD")],
            vec![Some("HELLO"), None, Some("WORLD")],
        ),
        // UTF8 characters, including scripts that have no letter case
        (
            vec![
                None,
                Some("السلام عليكم"),
                Some("Dobrý den"),
                Some("שָׁלוֹם"),
                Some("नमस्ते"),
                Some("こんにちは"),
                Some("안녕하세요"),
                Some("你好"),
                Some("Olá"),
                Some("Здравствуйте"),
                Some("Hola"),
            ],
            vec![
                None,
                Some("السلام عليكم"),
                Some("DOBRÝ DEN"),
                Some("שָׁלוֹם"),
                Some("नमस्ते"),
                Some("こんにちは"),
                Some("안녕하세요"),
                Some("你好"),
                Some("OLÁ"),
                Some("ЗДРАВСТВУЙТЕ"),
                Some("HOLA"),
            ],
        ),
    ];

    for (input, expected) in cases {
        let array = Utf8Array::<O>::from(&input);
        let result = upper(&array)?;
        assert_eq!(array.len(), result.len());

        // The kernel returns a boxed `dyn Array`; recover the concrete type to compare values.
        let result = result.as_any().downcast_ref::<Utf8Array<O>>().unwrap();
        let expected = Utf8Array::<O>::from(&expected);

        assert_eq!(&expected, result);
    }

    Ok(())
}

// Runs the nullable-input cases with `i32` offsets (`Utf8Array<i32>`).
#[test]
fn with_nulls_string() -> Result<()> {
with_nulls_utf8::<i32>()
}

// Runs the nullable-input cases with `i64` offsets (`Utf8Array<i64>`).
#[test]
fn with_nulls_large_string() -> Result<()> {
with_nulls_utf8::<i64>()
}

/// Runs `upper` over non-nullable string arrays with offset type `O` and
/// compares every result with the expected upper-cased array.
fn without_nulls_utf8<O: Offset>() -> Result<()> {
    let cases = vec![
        // all-lowercase input
        (vec!["hello", "world"], vec!["HELLO", "WORLD"]),
        // mixed-case input
        (vec!["Hello", "wOrld"], vec!["HELLO", "WORLD"]),
        // already upper-cased input (identity)
        (vec!["HELLO", "WORLD"], vec!["HELLO", "WORLD"]),
        // UTF8 characters
        (
            vec![
                "السلام عليكم",
                "Dobrý den",
                "שָׁלוֹם",
                "नमस्ते",
                "こんにちは",
                "안녕하세요",
                "你好",
                "Olá",
                "Здравствуйте",
                "Hola",
            ],
            vec![
                "السلام عليكم",
                "DOBRÝ DEN",
                "שָׁלוֹם",
                "नमस्ते",
                "こんにちは",
                "안녕하세요",
                "你好",
                "OLÁ",
                "ЗДРАВСТВУЙТЕ",
                "HOLA",
            ],
        ),
    ];

    for (input, wanted) in cases {
        let source = Utf8Array::<O>::from_slice(&input);
        let output = upper(&source)?;
        assert_eq!(source.len(), output.len());

        // Recover the concrete array type from the boxed `dyn Array` before comparing.
        let output = output.as_any().downcast_ref::<Utf8Array<O>>().unwrap();
        let wanted = Utf8Array::<O>::from_slice(&wanted);
        assert_eq!(&wanted, output);
    }

    Ok(())
}

// Runs the non-nullable cases with `i32` offsets (`Utf8Array<i32>`).
#[test]
fn without_nulls_string() -> Result<()> {
without_nulls_utf8::<i32>()
}

// Runs the non-nullable cases with `i64` offsets (`Utf8Array<i64>`).
#[test]
fn without_nulls_large_string() -> Result<()> {
without_nulls_utf8::<i64>()
}

// Checks that `can_upper` and `upper` agree: for every listed data type,
// `upper` succeeds on a null array of that type exactly when `can_upper`
// reports the type as supported.
#[test]
fn consistency() {
    use arrow2::datatypes::DataType::*;
    use arrow2::datatypes::TimeUnit;
    let datatypes = vec![
        Null,
        Boolean,
        UInt8,
        UInt16,
        UInt32,
        UInt64,
        Int8,
        Int16,
        Int32,
        Int64,
        Float32,
        Float64,
        Timestamp(TimeUnit::Second, None),
        Timestamp(TimeUnit::Millisecond, None),
        Timestamp(TimeUnit::Microsecond, None),
        Timestamp(TimeUnit::Nanosecond, None),
        Time64(TimeUnit::Microsecond),
        Time64(TimeUnit::Nanosecond),
        Date32,
        Time32(TimeUnit::Second),
        Time32(TimeUnit::Millisecond),
        Date64,
        Utf8,
        LargeUtf8,
        Binary,
        LargeBinary,
        Duration(TimeUnit::Second),
        Duration(TimeUnit::Millisecond),
        Duration(TimeUnit::Microsecond),
        Duration(TimeUnit::Nanosecond),
    ];

    for data_type in datatypes {
        // A 10-element all-null array is enough to exercise the type dispatch.
        let array = new_null_array(data_type.clone(), 10);
        let outcome = upper(array.as_ref());
        if can_upper(&data_type) {
            assert!(outcome.is_ok());
        } else {
            assert!(outcome.is_err());
        }
    }
}

0 comments on commit fd37721

Please sign in to comment.