diff --git a/datafusion/functions/benches/character_length.rs b/datafusion/functions/benches/character_length.rs index b3fdb8dc8561..3655d8409807 100644 --- a/datafusion/functions/benches/character_length.rs +++ b/datafusion/functions/benches/character_length.rs @@ -17,62 +17,10 @@ extern crate criterion; -use arrow::array::{StringArray, StringViewArray}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use datafusion_expr::ColumnarValue; -use rand::distributions::Alphanumeric; -use rand::{rngs::StdRng, Rng, SeedableRng}; -use std::sync::Arc; +use helper::gen_string_array; -/// gen_arr(4096, 128, 0.1, 0.1, true) will generate a StringViewArray with -/// 4096 rows, each row containing a string with 128 random characters. -/// around 10% of the rows are null, around 10% of the rows are non-ASCII. -fn gen_string_array( - n_rows: usize, - str_len_chars: usize, - null_density: f32, - utf8_density: f32, - is_string_view: bool, // false -> StringArray, true -> StringViewArray -) -> Vec { - let mut rng = StdRng::seed_from_u64(42); - let rng_ref = &mut rng; - - let corpus = "DataFusionДатаФусион数据融合📊🔥"; // includes utf8 encoding with 1~4 bytes - let corpus_char_count = corpus.chars().count(); - - let mut output_string_vec: Vec> = Vec::with_capacity(n_rows); - for _ in 0..n_rows { - let rand_num = rng_ref.gen::(); // [0.0, 1.0) - if rand_num < null_density { - output_string_vec.push(None); - } else if rand_num < null_density + utf8_density { - // Generate random UTF8 string - let mut generated_string = String::with_capacity(str_len_chars); - for _ in 0..str_len_chars { - let idx = rng_ref.gen_range(0..corpus_char_count); - let char = corpus.chars().nth(idx).unwrap(); - generated_string.push(char); - } - output_string_vec.push(Some(generated_string)); - } else { - // Generate random ASCII-only string - let value = rng_ref - .sample_iter(&Alphanumeric) - .take(str_len_chars) - .collect(); - let value = String::from_utf8(value).unwrap(); - output_string_vec.push(Some(value)); - } - } - - if is_string_view { - let string_view_array: StringViewArray = output_string_vec.into_iter().collect(); - vec![ColumnarValue::Array(Arc::new(string_view_array))] - } else { - let string_array: StringArray = output_string_vec.clone().into_iter().collect(); - vec![ColumnarValue::Array(Arc::new(string_array))] - } -} +mod helper; fn criterion_benchmark(c: &mut Criterion) { // All benches are single batch run with 8192 rows diff --git a/datafusion/functions/benches/helper.rs b/datafusion/functions/benches/helper.rs new file mode 100644 index 000000000000..c7c405bc4696 --- /dev/null +++ b/datafusion/functions/benches/helper.rs @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{StringArray, StringViewArray}; +use datafusion_expr::ColumnarValue; +use rand::distributions::Alphanumeric; +use rand::{rngs::StdRng, Rng, SeedableRng}; +use std::sync::Arc; + +/// gen_arr(4096, 128, 0.1, 0.1, true) will generate a StringViewArray with +/// 4096 rows, each row containing a string with 128 random characters. +/// around 10% of the rows are null, around 10% of the rows are non-ASCII. +pub fn gen_string_array( + n_rows: usize, + str_len_chars: usize, + null_density: f32, + utf8_density: f32, + is_string_view: bool, // false -> StringArray, true -> StringViewArray +) -> Vec { + let mut rng = StdRng::seed_from_u64(42); + let rng_ref = &mut rng; + + let corpus = "DataFusionДатаФусион数据融合📊🔥"; // includes utf8 encoding with 1~4 bytes + let corpus_char_count = corpus.chars().count(); + + let mut output_string_vec: Vec> = Vec::with_capacity(n_rows); + for _ in 0..n_rows { + let rand_num = rng_ref.gen::(); // [0.0, 1.0) + if rand_num < null_density { + output_string_vec.push(None); + } else if rand_num < null_density + utf8_density { + // Generate random UTF8 string + let mut generated_string = String::with_capacity(str_len_chars); + for _ in 0..str_len_chars { + let idx = rng_ref.gen_range(0..corpus_char_count); + let char = corpus.chars().nth(idx).unwrap(); + generated_string.push(char); + } + output_string_vec.push(Some(generated_string)); + } else { + // Generate random ASCII-only string + let value = rng_ref + .sample_iter(&Alphanumeric) + .take(str_len_chars) + .collect(); + let value = String::from_utf8(value).unwrap(); + output_string_vec.push(Some(value)); + } + } + + if is_string_view { + let string_view_array: StringViewArray = output_string_vec.into_iter().collect(); + vec![ColumnarValue::Array(Arc::new(string_view_array))] + } else { + let string_array: StringArray = output_string_vec.clone().into_iter().collect(); + vec![ColumnarValue::Array(Arc::new(string_array))] + } +} diff --git a/datafusion/functions/benches/reverse.rs b/datafusion/functions/benches/reverse.rs index c7c1ef8a8220..889ca59e2a14 100644 --- a/datafusion/functions/benches/reverse.rs +++ b/datafusion/functions/benches/reverse.rs @@ -16,70 +16,84 @@ // under the License. extern crate criterion; +mod helper; -use arrow::array::OffsetSizeTrait; -use arrow::util::bench_util::{ - create_string_array_with_len, create_string_view_array_with_len, -}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use datafusion_expr::ColumnarValue; -use datafusion_functions::unicode; -use std::sync::Arc; - -fn create_args( - size: usize, - str_len: usize, - force_view_types: bool, -) -> Vec { - if force_view_types { - let string_array = - Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); - - vec![ColumnarValue::Array(string_array)] - } else { - let string_array = - Arc::new(create_string_array_with_len::(size, 0.1, str_len)); - - vec![ColumnarValue::Array(string_array)] - } -} +use helper::gen_string_array; fn criterion_benchmark(c: &mut Criterion) { - let reverse = unicode::reverse(); - for size in [1024, 4096] { - let str_len = 8; + // All benches are single batch run with 8192 rows + let reverse = datafusion_functions::unicode::reverse(); - let args = create_args::(size, str_len, true); + const N_ROWS: usize = 8192; + const NULL_DENSITY: f32 = 0.1; + const UTF8_DENSITY_OF_ALL_ASCII: f32 = 0.0; + const NORMAL_UTF8_DENSITY: f32 = 0.8; + for str_len in [8, 32, 128, 4096] { + // StringArray ASCII only + let args_string_ascii = gen_string_array( + N_ROWS, + str_len, + NULL_DENSITY, + UTF8_DENSITY_OF_ALL_ASCII, + false, + ); c.bench_function( - format!("reverse_string_view [size={}, str_len={}]", size, str_len).as_str(), + &format!("reverse_StringArray_ascii_str_len_{}", str_len), |b| { b.iter(|| { // TODO use invoke_with_args - black_box(reverse.invoke_batch(&args, str_len)) + black_box(reverse.invoke_batch(&args_string_ascii, N_ROWS)) }) }, ); - let str_len = 32; + // StringArray UTF8 + let args_string_utf8 = + gen_string_array(N_ROWS, str_len, NULL_DENSITY, NORMAL_UTF8_DENSITY, false); + c.bench_function( + &format!( + "reverse_StringArray_utf8_density_{}_str_len_{}", + NORMAL_UTF8_DENSITY, str_len + ), + |b| { + b.iter(|| { + // TODO use invoke_with_args + black_box(reverse.invoke_batch(&args_string_utf8, N_ROWS)) + }) + }, + ); - let args = create_args::(size, str_len, true); + // StringViewArray ASCII only + let args_string_view_ascii = gen_string_array( + N_ROWS, + str_len, + NULL_DENSITY, + UTF8_DENSITY_OF_ALL_ASCII, + true, + ); c.bench_function( - format!("reverse_string_view [size={}, str_len={}]", size, str_len).as_str(), + &format!("reverse_StringViewArray_ascii_str_len_{}", str_len), |b| { b.iter(|| { // TODO use invoke_with_args - black_box(reverse.invoke_batch(&args, str_len)) + black_box(reverse.invoke_batch(&args_string_view_ascii, N_ROWS)) }) }, ); - let args = create_args::(size, str_len, false); + // StringViewArray UTF8 + let args_string_view_utf8 = + gen_string_array(N_ROWS, str_len, NULL_DENSITY, NORMAL_UTF8_DENSITY, true); c.bench_function( - format!("reverse_string [size={}, str_len={}]", size, str_len).as_str(), + &format!( + "reverse_StringViewArray_utf8_density_{}_str_len_{}", + NORMAL_UTF8_DENSITY, str_len + ), |b| { b.iter(|| { // TODO use invoke_with_args - black_box(reverse.invoke_batch(&args, str_len)) + black_box(reverse.invoke_batch(&args_string_view_utf8, N_ROWS)) }) }, ); diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs index f07deda70e52..c941fe32c13b 100644 --- a/datafusion/functions/src/unicode/reverse.rs +++ b/datafusion/functions/src/unicode/reverse.rs @@ -119,12 +119,23 @@ fn reverse_impl<'a, T: OffsetSizeTrait, V: StringArrayType<'a>>( ) -> Result { let mut builder = GenericStringBuilder::::with_capacity(string_array.len(), 1024); - let mut reversed = String::new(); + let mut string_buf = String::new(); + let mut byte_buf = Vec::::new(); for string in string_array.iter() { if let Some(s) = string { - reversed.extend(s.chars().rev()); - builder.append_value(&reversed); - reversed.clear(); + if s.is_ascii() { + // reverse bytes directly since ASCII characters are single bytes + byte_buf.extend(s.as_bytes()); + byte_buf.reverse(); + // SAFETY: Since the original string was ASCII, reversing the bytes still results in valid UTF-8. + let reversed = unsafe { std::str::from_utf8_unchecked(&byte_buf) }; + builder.append_value(reversed); + byte_buf.clear(); + } else { + string_buf.extend(s.chars().rev()); + builder.append_value(&string_buf); + string_buf.clear(); + } } else { builder.append_null(); }