Skip to content

Commit

Permalink
Faster reverse() string function for ASCII-only case (#14195)
Browse files Browse the repository at this point in the history
* Faster reverse() string function for ASCII-only case

* add byte_buf
  • Loading branch information
UBarney authored Jan 22, 2025
1 parent 3efcd6a commit 2ac20e3
Show file tree
Hide file tree
Showing 4 changed files with 141 additions and 96 deletions.
56 changes: 2 additions & 54 deletions datafusion/functions/benches/character_length.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,62 +17,10 @@

extern crate criterion;

use arrow::array::{StringArray, StringViewArray};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr::ColumnarValue;
use rand::distributions::Alphanumeric;
use rand::{rngs::StdRng, Rng, SeedableRng};
use std::sync::Arc;
use helper::gen_string_array;

/// Builds a single-element argument list holding a random string array for
/// benchmarks.
///
/// `gen_string_array(4096, 128, 0.1, 0.1, true)` will generate a
/// `StringViewArray` with 4096 rows, each non-null row containing a string
/// with 128 random characters. Around 10% of the rows are null, and around
/// 10% of the rows are drawn from a corpus that mixes 1~4 byte UTF-8
/// characters; the remaining rows are ASCII alphanumeric.
///
/// * `n_rows` - number of rows in the generated array.
/// * `str_len_chars` - length of every non-null string, in characters.
/// * `null_density` - approximate fraction of null rows.
/// * `utf8_density` - approximate fraction of rows drawn from the UTF-8 corpus.
/// * `is_string_view` - `false` -> `StringArray`, `true` -> `StringViewArray`.
///
/// The RNG is seeded with a fixed value, so the same arguments always
/// produce the same data.
fn gen_string_array(
    n_rows: usize,
    str_len_chars: usize,
    null_density: f32,
    utf8_density: f32,
    is_string_view: bool, // false -> StringArray, true -> StringViewArray
) -> Vec<ColumnarValue> {
    let mut rng = StdRng::seed_from_u64(42);
    let rng_ref = &mut rng;

    let corpus = "DataFusionДатаФусион数据融合📊🔥"; // includes utf8 encoding with 1~4 bytes
    // Decode the corpus once so each random pick is an O(1) index instead of
    // re-walking the UTF-8 string with `chars().nth(idx)` for every character.
    let corpus_chars: Vec<char> = corpus.chars().collect();

    let mut output_string_vec: Vec<Option<String>> = Vec::with_capacity(n_rows);
    for _ in 0..n_rows {
        let rand_num = rng_ref.gen::<f32>(); // [0.0, 1.0)
        if rand_num < null_density {
            output_string_vec.push(None);
        } else if rand_num < null_density + utf8_density {
            // Generate random UTF8 string
            let mut generated_string = String::with_capacity(str_len_chars);
            for _ in 0..str_len_chars {
                let idx = rng_ref.gen_range(0..corpus_chars.len());
                generated_string.push(corpus_chars[idx]);
            }
            output_string_vec.push(Some(generated_string));
        } else {
            // Generate random ASCII-only string
            let value = rng_ref
                .sample_iter(&Alphanumeric)
                .take(str_len_chars)
                .collect();
            let value = String::from_utf8(value).unwrap();
            output_string_vec.push(Some(value));
        }
    }

    // The vector is consumed exactly once in either branch, so no clone is needed.
    if is_string_view {
        let string_view_array: StringViewArray = output_string_vec.into_iter().collect();
        vec![ColumnarValue::Array(Arc::new(string_view_array))]
    } else {
        let string_array: StringArray = output_string_vec.into_iter().collect();
        vec![ColumnarValue::Array(Arc::new(string_array))]
    }
}
mod helper;

fn criterion_benchmark(c: &mut Criterion) {
// All benches are single batch run with 8192 rows
Expand Down
72 changes: 72 additions & 0 deletions datafusion/functions/benches/helper.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::array::{StringArray, StringViewArray};
use datafusion_expr::ColumnarValue;
use rand::distributions::Alphanumeric;
use rand::{rngs::StdRng, Rng, SeedableRng};
use std::sync::Arc;

/// Builds a single-element argument list holding a random string array for
/// benchmarks.
///
/// `gen_string_array(4096, 128, 0.1, 0.1, true)` will generate a
/// `StringViewArray` with 4096 rows, each non-null row containing a string
/// with 128 random characters. Around 10% of the rows are null, and around
/// 10% of the rows are drawn from a corpus that mixes 1~4 byte UTF-8
/// characters; the remaining rows are ASCII alphanumeric.
///
/// * `n_rows` - number of rows in the generated array.
/// * `str_len_chars` - length of every non-null string, in characters.
/// * `null_density` - approximate fraction of null rows.
/// * `utf8_density` - approximate fraction of rows drawn from the UTF-8 corpus.
/// * `is_string_view` - `false` -> `StringArray`, `true` -> `StringViewArray`.
///
/// The RNG is seeded with a fixed value, so the same arguments always
/// produce the same data.
pub fn gen_string_array(
    n_rows: usize,
    str_len_chars: usize,
    null_density: f32,
    utf8_density: f32,
    is_string_view: bool, // false -> StringArray, true -> StringViewArray
) -> Vec<ColumnarValue> {
    let mut rng = StdRng::seed_from_u64(42);
    let rng_ref = &mut rng;

    let corpus = "DataFusionДатаФусион数据融合📊🔥"; // includes utf8 encoding with 1~4 bytes
    // Decode the corpus once so each random pick is an O(1) index instead of
    // re-walking the UTF-8 string with `chars().nth(idx)` for every character.
    let corpus_chars: Vec<char> = corpus.chars().collect();

    let mut output_string_vec: Vec<Option<String>> = Vec::with_capacity(n_rows);
    for _ in 0..n_rows {
        let rand_num = rng_ref.gen::<f32>(); // [0.0, 1.0)
        if rand_num < null_density {
            output_string_vec.push(None);
        } else if rand_num < null_density + utf8_density {
            // Generate random UTF8 string
            let mut generated_string = String::with_capacity(str_len_chars);
            for _ in 0..str_len_chars {
                let idx = rng_ref.gen_range(0..corpus_chars.len());
                generated_string.push(corpus_chars[idx]);
            }
            output_string_vec.push(Some(generated_string));
        } else {
            // Generate random ASCII-only string
            let value = rng_ref
                .sample_iter(&Alphanumeric)
                .take(str_len_chars)
                .collect();
            let value = String::from_utf8(value).unwrap();
            output_string_vec.push(Some(value));
        }
    }

    // The vector is consumed exactly once in either branch, so no clone is needed.
    if is_string_view {
        let string_view_array: StringViewArray = output_string_vec.into_iter().collect();
        vec![ColumnarValue::Array(Arc::new(string_view_array))]
    } else {
        let string_array: StringArray = output_string_vec.into_iter().collect();
        vec![ColumnarValue::Array(Arc::new(string_array))]
    }
}
90 changes: 52 additions & 38 deletions datafusion/functions/benches/reverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,70 +16,84 @@
// under the License.

extern crate criterion;
mod helper;

use arrow::array::OffsetSizeTrait;
use arrow::util::bench_util::{
create_string_array_with_len, create_string_view_array_with_len,
};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr::ColumnarValue;
use datafusion_functions::unicode;
use std::sync::Arc;

/// Builds the one-element argument list passed to the `reverse` benchmark.
///
/// The single column holds `size` generated strings of length `str_len`,
/// created with a null density of 0.1. When `force_view_types` is set the
/// column is built with `create_string_view_array_with_len`; otherwise it is
/// built with `create_string_array_with_len` using offset type `O`.
fn create_args<O: OffsetSizeTrait>(
    size: usize,
    str_len: usize,
    force_view_types: bool,
) -> Vec<ColumnarValue> {
    // Pick the array flavour first, then wrap the result in the argument vector once.
    let column = if force_view_types {
        ColumnarValue::Array(Arc::new(create_string_view_array_with_len(
            size, 0.1, str_len, false,
        )))
    } else {
        ColumnarValue::Array(Arc::new(create_string_array_with_len::<O>(
            size, 0.1, str_len,
        )))
    };

    vec![column]
}
use helper::gen_string_array;

fn criterion_benchmark(c: &mut Criterion) {
let reverse = unicode::reverse();
for size in [1024, 4096] {
let str_len = 8;
// All benches are single batch run with 8192 rows
let reverse = datafusion_functions::unicode::reverse();

let args = create_args::<i32>(size, str_len, true);
const N_ROWS: usize = 8192;
const NULL_DENSITY: f32 = 0.1;
const UTF8_DENSITY_OF_ALL_ASCII: f32 = 0.0;
const NORMAL_UTF8_DENSITY: f32 = 0.8;
for str_len in [8, 32, 128, 4096] {
// StringArray ASCII only
let args_string_ascii = gen_string_array(
N_ROWS,
str_len,
NULL_DENSITY,
UTF8_DENSITY_OF_ALL_ASCII,
false,
);
c.bench_function(
format!("reverse_string_view [size={}, str_len={}]", size, str_len).as_str(),
&format!("reverse_StringArray_ascii_str_len_{}", str_len),
|b| {
b.iter(|| {
// TODO use invoke_with_args
black_box(reverse.invoke_batch(&args, str_len))
black_box(reverse.invoke_batch(&args_string_ascii, N_ROWS))
})
},
);

let str_len = 32;
// StringArray UTF8
let args_string_utf8 =
gen_string_array(N_ROWS, str_len, NULL_DENSITY, NORMAL_UTF8_DENSITY, false);
c.bench_function(
&format!(
"reverse_StringArray_utf8_density_{}_str_len_{}",
NORMAL_UTF8_DENSITY, str_len
),
|b| {
b.iter(|| {
// TODO use invoke_with_args
black_box(reverse.invoke_batch(&args_string_utf8, N_ROWS))
})
},
);

let args = create_args::<i32>(size, str_len, true);
// StringViewArray ASCII only
let args_string_view_ascii = gen_string_array(
N_ROWS,
str_len,
NULL_DENSITY,
UTF8_DENSITY_OF_ALL_ASCII,
true,
);
c.bench_function(
format!("reverse_string_view [size={}, str_len={}]", size, str_len).as_str(),
&format!("reverse_StringViewArray_ascii_str_len_{}", str_len),
|b| {
b.iter(|| {
// TODO use invoke_with_args
black_box(reverse.invoke_batch(&args, str_len))
black_box(reverse.invoke_batch(&args_string_view_ascii, N_ROWS))
})
},
);

let args = create_args::<i32>(size, str_len, false);
// StringViewArray UTF8
let args_string_view_utf8 =
gen_string_array(N_ROWS, str_len, NULL_DENSITY, NORMAL_UTF8_DENSITY, true);
c.bench_function(
format!("reverse_string [size={}, str_len={}]", size, str_len).as_str(),
&format!(
"reverse_StringViewArray_utf8_density_{}_str_len_{}",
NORMAL_UTF8_DENSITY, str_len
),
|b| {
b.iter(|| {
// TODO use invoke_with_args
black_box(reverse.invoke_batch(&args, str_len))
black_box(reverse.invoke_batch(&args_string_view_utf8, N_ROWS))
})
},
);
Expand Down
19 changes: 15 additions & 4 deletions datafusion/functions/src/unicode/reverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,23 @@ fn reverse_impl<'a, T: OffsetSizeTrait, V: StringArrayType<'a>>(
) -> Result<ArrayRef> {
let mut builder = GenericStringBuilder::<T>::with_capacity(string_array.len(), 1024);

let mut reversed = String::new();
let mut string_buf = String::new();
let mut byte_buf = Vec::<u8>::new();
for string in string_array.iter() {
if let Some(s) = string {
reversed.extend(s.chars().rev());
builder.append_value(&reversed);
reversed.clear();
if s.is_ascii() {
// reverse bytes directly since ASCII characters are single bytes
byte_buf.extend(s.as_bytes());
byte_buf.reverse();
// SAFETY: Since the original string was ASCII, reversing the bytes still results in valid UTF-8.
let reversed = unsafe { std::str::from_utf8_unchecked(&byte_buf) };
builder.append_value(reversed);
byte_buf.clear();
} else {
string_buf.extend(s.chars().rev());
builder.append_value(&string_buf);
string_buf.clear();
}
} else {
builder.append_null();
}
Expand Down

0 comments on commit 2ac20e3

Please sign in to comment.