Skip to content

Commit

Permalink
feat: Add Time/Interval/Decimal/Utf8View in aggregate fuzz te…
Browse files Browse the repository at this point in the history
…sting (#13226)

* support Time/Interval/Decimal types in data generator.

* introduce RandomNativeData trait.

* fix bug.

* support utf8view type in data generator.

* fix clippy.

* fix bug.
  • Loading branch information
LeslieKid authored Nov 5, 2024
1 parent cc43766 commit f2344d2
Show file tree
Hide file tree
Showing 8 changed files with 433 additions and 93 deletions.
47 changes: 42 additions & 5 deletions datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ use arrow::datatypes::DataType;
use arrow::record_batch::RecordBatch;
use arrow::util::pretty::pretty_format_batches;
use arrow_array::types::Int64Type;
use arrow_schema::{
IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE,
DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE,
};
use datafusion::common::Result;
use datafusion::datasource::MemTable;
use datafusion::physical_expr::aggregate::AggregateExprBuilder;
Expand All @@ -45,7 +49,7 @@ use crate::fuzz_cases::aggregation_fuzzer::{
use datafusion_common::HashMap;
use datafusion_physical_expr_common::sort_expr::LexOrdering;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use rand::{thread_rng, Rng, SeedableRng};
use tokio::task::JoinSet;

// ========================================================================
Expand Down Expand Up @@ -151,6 +155,7 @@ async fn test_count() {
/// 1. Floating point numbers
/// 1. structured types
fn baseline_config() -> DatasetGeneratorConfig {
let mut rng = thread_rng();
let columns = vec![
ColumnDescr::new("i8", DataType::Int8),
ColumnDescr::new("i16", DataType::Int16),
Expand All @@ -162,13 +167,45 @@ fn baseline_config() -> DatasetGeneratorConfig {
ColumnDescr::new("u64", DataType::UInt64),
ColumnDescr::new("date32", DataType::Date32),
ColumnDescr::new("date64", DataType::Date64),
// TODO: date/time columns
// todo decimal columns
ColumnDescr::new("time32_s", DataType::Time32(TimeUnit::Second)),
ColumnDescr::new("time32_ms", DataType::Time32(TimeUnit::Millisecond)),
ColumnDescr::new("time64_us", DataType::Time64(TimeUnit::Microsecond)),
ColumnDescr::new("time64_ns", DataType::Time64(TimeUnit::Nanosecond)),
ColumnDescr::new(
"interval_year_month",
DataType::Interval(IntervalUnit::YearMonth),
),
ColumnDescr::new(
"interval_day_time",
DataType::Interval(IntervalUnit::DayTime),
),
ColumnDescr::new(
"interval_month_day_nano",
DataType::Interval(IntervalUnit::MonthDayNano),
),
// begin decimal columns
ColumnDescr::new("decimal128", {
// Generate valid precision and scale for Decimal128 randomly.
let precision: u8 = rng.gen_range(1..=DECIMAL128_MAX_PRECISION);
// It's safe to cast `precision` to i8 type directly.
let scale: i8 = rng.gen_range(
i8::MIN..=std::cmp::min(precision as i8, DECIMAL128_MAX_SCALE),
);
DataType::Decimal128(precision, scale)
}),
ColumnDescr::new("decimal256", {
// Generate valid precision and scale for Decimal256 randomly.
let precision: u8 = rng.gen_range(1..=DECIMAL256_MAX_PRECISION);
// It's safe to cast `precision` to i8 type directly.
let scale: i8 = rng.gen_range(
i8::MIN..=std::cmp::min(precision as i8, DECIMAL256_MAX_SCALE),
);
DataType::Decimal256(precision, scale)
}),
// begin string columns
ColumnDescr::new("utf8", DataType::Utf8),
ColumnDescr::new("largeutf8", DataType::LargeUtf8),
// TODO add support for utf8view in data generator
// ColumnDescr::new("utf8view", DataType::Utf8View),
ColumnDescr::new("utf8view", DataType::Utf8View),
// todo binary
// low cardinality columns
ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10),
Expand Down
173 changes: 151 additions & 22 deletions datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,14 @@
use std::sync::Arc;

use arrow::datatypes::{
Date32Type, Date64Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type,
Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
ByteArrayType, ByteViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type,
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, LargeUtf8Type,
StringViewType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
Time64NanosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, Utf8Type,
};
use arrow_array::{ArrayRef, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use arrow_schema::{DataType, Field, IntervalUnit, Schema, TimeUnit};
use datafusion_common::{arrow_datafusion_err, DataFusionError, Result};
use datafusion_physical_expr::{expressions::col, PhysicalSortExpr};
use datafusion_physical_expr_common::sort_expr::LexOrdering;
Expand All @@ -32,7 +35,7 @@ use rand::{
thread_rng, Rng, SeedableRng,
};
use test_utils::{
array_gen::{PrimitiveArrayGenerator, StringArrayGenerator},
array_gen::{DecimalArrayGenerator, PrimitiveArrayGenerator, StringArrayGenerator},
stagger_batch,
};

Expand Down Expand Up @@ -219,7 +222,7 @@ struct RecordBatchGenerator {
}

macro_rules! generate_string_array {
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $OFFSET_TYPE:ty) => {{
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE: ident) => {{
let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
let max_len = $BATCH_GEN_RNG.gen_range(1..50);
Expand All @@ -232,25 +235,47 @@ macro_rules! generate_string_array {
rng: $ARRAY_GEN_RNG,
};

generator.gen_data::<$OFFSET_TYPE>()
match $ARROW_TYPE::DATA_TYPE {
DataType::Utf8 => generator.gen_data::<i32>(),
DataType::LargeUtf8 => generator.gen_data::<i64>(),
DataType::Utf8View => generator.gen_string_view(),
_ => unreachable!(),
}
}};
}

/// Expands to an expression that yields a randomly generated decimal array.
///
/// A null fraction is first picked at random (using `$BATCH_GEN_RNG`) from
/// `$SELF.candidate_null_pcts`; a `DecimalArrayGenerator` is then configured
/// with `$PRECISION`, `$SCALE`, `$NUM_ROWS` rows, `$MAX_NUM_DISTINCT` distinct
/// values and `$ARRAY_GEN_RNG`, and asked to produce data for `$ARROW_TYPE`.
macro_rules! generate_decimal_array {
    (
        $SELF:ident,
        $NUM_ROWS:ident,
        $MAX_NUM_DISTINCT:expr,
        $BATCH_GEN_RNG:ident,
        $ARRAY_GEN_RNG:ident,
        $PRECISION:ident,
        $SCALE:ident,
        $ARROW_TYPE:ident
    ) => {{
        // Choose one of the configured candidate null fractions for this column.
        let null_pct = {
            let pick = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
            $SELF.candidate_null_pcts[pick]
        };

        // Build the generator and immediately produce the array from it.
        DecimalArrayGenerator {
            precision: $PRECISION,
            scale: $SCALE,
            num_decimals: $NUM_ROWS,
            num_distinct_decimals: $MAX_NUM_DISTINCT,
            null_pct,
            rng: $ARRAY_GEN_RNG,
        }
        .gen_data::<$ARROW_TYPE>()
    }};
}

/// Expands to an expression that yields a randomly generated primitive array.
///
/// A null fraction is picked at random (using `$BATCH_GEN_RNG`) from
/// `$SELF.candidate_null_pcts`; a `PrimitiveArrayGenerator` is then configured
/// with `$NUM_ROWS` rows, `$MAX_NUM_DISTINCT` distinct values and
/// `$ARRAY_GEN_RNG`, and asked to produce data for `$ARROW_TYPE`.
///
/// The macro has a single arm: the former `paste::paste!`-wrapped arm used the
/// exact same matcher, so keeping both would leave the second one unreachable.
macro_rules! generate_primitive_array {
    ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {{
        // Choose one of the configured candidate null fractions for this column.
        let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
        let null_pct = $SELF.candidate_null_pcts[null_pct_idx];

        let mut generator = PrimitiveArrayGenerator {
            num_primitives: $NUM_ROWS,
            num_distinct_primitives: $MAX_NUM_DISTINCT,
            null_pct,
            rng: $ARRAY_GEN_RNG,
        };

        generator.gen_data::<$ARROW_TYPE>()
    }};
}

impl RecordBatchGenerator {
Expand Down Expand Up @@ -432,14 +457,108 @@ impl RecordBatchGenerator {
Date64Type
)
}
DataType::Time32(TimeUnit::Second) => {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Time32SecondType
)
}
DataType::Time32(TimeUnit::Millisecond) => {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Time32MillisecondType
)
}
DataType::Time64(TimeUnit::Microsecond) => {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Time64MicrosecondType
)
}
DataType::Time64(TimeUnit::Nanosecond) => {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Time64NanosecondType
)
}
DataType::Interval(IntervalUnit::YearMonth) => {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
IntervalYearMonthType
)
}
DataType::Interval(IntervalUnit::DayTime) => {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
IntervalDayTimeType
)
}
DataType::Interval(IntervalUnit::MonthDayNano) => {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
IntervalMonthDayNanoType
)
}
DataType::Decimal128(precision, scale) => {
generate_decimal_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
precision,
scale,
Decimal128Type
)
}
DataType::Decimal256(precision, scale) => {
generate_decimal_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
precision,
scale,
Decimal256Type
)
}
DataType::Utf8 => {
generate_string_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
i32
Utf8Type
)
}
DataType::LargeUtf8 => {
Expand All @@ -449,7 +568,17 @@ impl RecordBatchGenerator {
max_num_distinct,
batch_gen_rng,
array_gen_rng,
i64
LargeUtf8Type
)
}
DataType::Utf8View => {
generate_string_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
StringViewType
)
}
_ => {
Expand Down
4 changes: 4 additions & 0 deletions datafusion/functions-aggregate/src/min_max/min_max_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,10 @@ impl GroupsAccumulator for MinMaxBytesAccumulator {
/// This is a heuristic to avoid allocating too many small buffers
fn capacity_to_view_block_size(data_capacity: usize) -> u32 {
let max_block_size = 2 * 1024 * 1024;
// Avoid block size equal to zero when calling `with_fixed_block_size()`.
if data_capacity == 0 {
return 1;
}
if let Ok(block_size) = u32::try_from(data_capacity) {
block_size.min(max_block_size)
} else {
Expand Down
79 changes: 79 additions & 0 deletions test-utils/src/array_gen/decimal.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::array::{ArrayRef, PrimitiveArray, PrimitiveBuilder, UInt32Array};
use arrow::datatypes::DecimalType;
use rand::rngs::StdRng;
use rand::Rng;

use super::random_data::RandomNativeData;

/// Randomly generates decimal arrays.
///
/// `gen_data` first builds a table of `num_distinct_decimals` random values
/// and then draws `num_decimals` rows (or nulls) from that table.
pub struct DecimalArrayGenerator {
/// The precision applied to the generated array via
/// `with_precision_and_scale`
pub precision: u8,
/// The scale applied to the generated array via
/// `with_precision_and_scale`
pub scale: i8,
/// The total number of rows (decimals or nulls) in the output
pub num_decimals: usize,
/// The size of the table of random decimals that output rows are drawn from
pub num_distinct_decimals: usize,
/// The null threshold: a row is null when a random `f64` draw is below
/// this value, so it acts as a fraction (e.g. `0.25`), not a 0-100
/// percentage
pub null_pct: f64,
/// Random number generator used for both the values and the row selection
pub rng: StdRng,
}

impl DecimalArrayGenerator {
    /// Create a Decimal128Array / Decimal256Array with random values.
    ///
    /// Builds a table of `num_distinct_decimals` random native values tagged
    /// with `precision` / `scale`, then produces `num_decimals` rows by
    /// `take`-ing randomly chosen table entries. A row is null when a random
    /// `f64` draw is below `null_pct`.
    ///
    /// # Panics
    ///
    /// Panics if `with_precision_and_scale` rejects `precision` / `scale`, or
    /// if the final `take` fails. NOTE(review): with `num_distinct_decimals`
    /// equal to 0 every non-null row asks for index 0 of an empty table, which
    /// is expected to fail — confirm callers never pass 0.
    pub fn gen_data<D>(&mut self) -> ArrayRef
    where
        D: DecimalType + RandomNativeData,
    {
        // Table of distinct decimals from which the output rows are drawn.
        let distinct_decimals: PrimitiveArray<D> = {
            let mut decimal_builder =
                PrimitiveBuilder::<D>::with_capacity(self.num_distinct_decimals);
            for _ in 0..self.num_distinct_decimals {
                decimal_builder.append_value(D::generate_random_native_data(&mut self.rng));
            }

            decimal_builder
                .finish()
                .with_precision_and_scale(self.precision, self.scale)
                .unwrap()
        };

        // Pick `num_decimals` entries at random from the distinct decimal table.
        let indices: UInt32Array = (0..self.num_decimals)
            .map(|_| {
                if self.rng.gen::<f64>() < self.null_pct {
                    None
                } else if self.num_distinct_decimals > 1 {
                    // Sample the whole table. The range must start at 0,
                    // otherwise the first entry is never selected and the
                    // output has one fewer distinct value than requested.
                    let range = 0..(self.num_distinct_decimals as u32);
                    Some(self.rng.gen_range(range))
                } else {
                    // A single-entry table leaves nothing to choose.
                    Some(0)
                }
            })
            .collect();

        let options = None;
        arrow::compute::take(&distinct_decimals, &indices, options).unwrap()
    }
}
Loading

0 comments on commit f2344d2

Please sign in to comment.