From 320e4d63424571b25e40682ab6e127b8e4509f6f Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Fri, 13 Dec 2024 05:40:22 +0700 Subject: [PATCH] Optimize performance of `initcap` function (~2x faster) (#13691) * Optimize performance of initcap (~2x faster) Signed-off-by: Tai Le Manh * format --------- Signed-off-by: Tai Le Manh --- datafusion/functions/Cargo.toml | 5 ++ datafusion/functions/benches/initcap.rs | 93 ++++++++++++++++++++++ datafusion/functions/src/string/initcap.rs | 27 ++++--- 3 files changed, 112 insertions(+), 13 deletions(-) create mode 100644 datafusion/functions/benches/initcap.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 36d4af9ab55b..575e8484a92f 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -207,3 +207,8 @@ required-features = ["unicode_expressions"] harness = false name = "trunc" required-features = ["math_expressions"] + +[[bench]] +harness = false +name = "initcap" +required-features = ["string_expressions"] diff --git a/datafusion/functions/benches/initcap.rs b/datafusion/functions/benches/initcap.rs new file mode 100644 index 000000000000..c88b6b513980 --- /dev/null +++ b/datafusion/functions/benches/initcap.rs @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::array::OffsetSizeTrait; +use arrow::datatypes::DataType; +use arrow::util::bench_util::{ + create_string_array_with_len, create_string_view_array_with_len, +}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; +use datafusion_functions::string; +use std::sync::Arc; + +fn create_args( + size: usize, + str_len: usize, + force_view_types: bool, +) -> Vec { + if force_view_types { + let string_array = + Arc::new(create_string_view_array_with_len(size, 0.2, str_len, false)); + + vec![ColumnarValue::Array(string_array)] + } else { + let string_array = + Arc::new(create_string_array_with_len::(size, 0.2, str_len)); + + vec![ColumnarValue::Array(string_array)] + } +} + +fn criterion_benchmark(c: &mut Criterion) { + let initcap = string::initcap(); + for size in [1024, 4096] { + let args = create_args::(size, 8, true); + c.bench_function( + format!("initcap string view shorter than 12 [size={}]", size).as_str(), + |b| { + b.iter(|| { + black_box(initcap.invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + number_rows: size, + return_type: &DataType::Utf8View, + })) + }) + }, + ); + + let args = create_args::(size, 16, true); + c.bench_function( + format!("initcap string view longer than 12 [size={}]", size).as_str(), + |b| { + b.iter(|| { + black_box(initcap.invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + number_rows: size, + return_type: &DataType::Utf8View, + })) + }) + }, + ); + + let args = create_args::(size, 16, false); + c.bench_function(format!("initcap string [size={}]", size).as_str(), |b| { + b.iter(|| { + black_box(initcap.invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + number_rows: size, + return_type: &DataType::Utf8, + })) + }) + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/string/initcap.rs b/datafusion/functions/src/string/initcap.rs index 338a89091d29..4ca0d6b6499a 100644 --- a/datafusion/functions/src/string/initcap.rs +++ b/datafusion/functions/src/string/initcap.rs @@ -132,21 +132,22 @@ fn initcap_utf8view(args: &[ArrayRef]) -> Result { Ok(Arc::new(result) as ArrayRef) } -fn initcap_string(string: Option<&str>) -> Option { - let mut char_vector = Vec::::new(); - string.map(|string: &str| { - char_vector.clear(); - let mut previous_character_letter_or_number = false; - for c in string.chars() { - if previous_character_letter_or_number { - char_vector.push(c.to_ascii_lowercase()); +fn initcap_string(input: Option<&str>) -> Option { + input.map(|s| { + let mut result = String::with_capacity(s.len()); + let mut prev_is_alphanumeric = false; + + for c in s.chars() { + let transformed = if prev_is_alphanumeric { + c.to_ascii_lowercase() } else { - char_vector.push(c.to_ascii_uppercase()); - } - previous_character_letter_or_number = - c.is_ascii_uppercase() || c.is_ascii_lowercase() || c.is_ascii_digit(); + c.to_ascii_uppercase() + }; + result.push(transformed); + prev_is_alphanumeric = c.is_ascii_alphanumeric(); } - char_vector.iter().collect::() + + result }) }