[Parquet] Add benchmark and test for writing NaNs to Parquet #6955

Merged: 2 commits, Jan 8, 2025
arrow/Cargo.toml (2 additions, 1 deletion)

@@ -55,6 +55,7 @@ arrow-string = { workspace = true }

rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true }
pyo3 = { version = "0.23", default-features = false, optional = true }
half = { version = "2.1", default-features = false, optional = true }

[package.metadata.docs.rs]
features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"]
@@ -70,7 +71,7 @@ prettyprint = ["arrow-cast/prettyprint"]
# not the core arrow code itself. Be aware that `rand` must be kept as
# an optional dependency for supporting compile to wasm32-unknown-unknown
# target without assuming an environment containing JavaScript.
test_utils = ["dep:rand"]
test_utils = ["dep:rand", "dep:half"]
pyarrow = ["pyo3", "ffi"]
# force_validate runs full data validation for all arrays that are created
# this is not enabled by default as it is too computationally expensive
arrow/src/util/bench_util.rs (46 additions, 0 deletions)

@@ -21,6 +21,7 @@ use crate::array::*;
use crate::datatypes::*;
use crate::util::test_util::seedable_rng;
use arrow_buffer::{Buffer, IntervalMonthDayNano};
use half::f16;
use rand::distributions::uniform::SampleUniform;
use rand::thread_rng;
use rand::Rng;
@@ -416,3 +417,48 @@

DictionaryArray::from(data)
}

/// Creates a random (but fixed-seeded) f16 array of a given size and nan-value density
pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
let mut rng = seedable_rng();

(0..size)
.map(|_| {
if rng.gen::<f32>() < nan_density {
Some(f16::NAN)
} else {
Some(f16::from_f32(rng.gen()))
}
})
.collect()
}

/// Creates a random (but fixed-seeded) f32 array of a given size and nan-value density
pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
let mut rng = seedable_rng();

(0..size)
.map(|_| {
if rng.gen::<f32>() < nan_density {
Some(f32::NAN)
} else {
Some(rng.gen())
}
})
.collect()
}

/// Creates a random (but fixed-seeded) f64 array of a given size and nan-value density
pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
let mut rng = seedable_rng();

(0..size)
.map(|_| {
if rng.gen::<f32>() < nan_density {
Some(f64::NAN)
} else {
Some(rng.gen())
}
})
.collect()
}
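The three generators above share one pattern: draw a uniform f32 per element and substitute NaN whenever it falls below `nan_density`. As a minimal sketch of using one of them on its own (assuming a binary built with the `arrow` crate's `test_utils` feature; the size and density below are illustrative):

```rust
use arrow::array::Array;
use arrow::util::bench_util::create_f64_array;

fn main() {
    // Roughly a quarter of the 1024 values should come out as NaN; the seeded
    // RNG makes the exact count reproducible across runs.
    let array = create_f64_array(1024, 0.25);
    let nan_count = array.iter().flatten().filter(|v| v.is_nan()).count();
    println!("{nan_count} of {} values are NaN", array.len());
}
```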
parquet/benches/arrow_writer.rs (33 additions, 0 deletions)

@@ -28,7 +28,9 @@ extern crate parquet;
use std::sync::Arc;

use arrow::datatypes::*;
use arrow::util::bench_util::{create_f16_array, create_f32_array, create_f64_array};
use arrow::{record_batch::RecordBatch, util::data_gen::*};
use arrow_array::RecordBatchOptions;
use parquet::file::properties::WriterProperties;
use parquet::{arrow::ArrowWriter, errors::Result};

@@ -181,6 +183,25 @@ fn create_bool_bench_batch_non_null(
)?)
}

fn create_float_bench_batch_with_nans(size: usize, nan_density: f32) -> Result<RecordBatch> {
let fields = vec![
Field::new("_1", DataType::Float16, false),
Field::new("_2", DataType::Float32, false),
Field::new("_3", DataType::Float64, false),
];
let schema = Schema::new(fields);
let columns: Vec<arrow_array::ArrayRef> = vec![
Arc::new(create_f16_array(size, nan_density)),
Arc::new(create_f32_array(size, nan_density)),
Arc::new(create_f64_array(size, nan_density)),
];
Ok(RecordBatch::try_new_with_options(
Arc::new(schema),
columns,
&RecordBatchOptions::new().with_match_field_names(false),
)?)
}

fn create_list_primitive_bench_batch(
size: usize,
null_density: f32,
@@ -459,6 +480,18 @@ fn bench_primitive_writer(c: &mut Criterion) {
b.iter(|| write_batch_enable_bloom_filter(&batch).unwrap())
});

let batch = create_float_bench_batch_with_nans(4096, 0.5).unwrap();
group.throughput(Throughput::Bytes(
batch
.columns()
.iter()
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
group.bench_function("4096 values float with NaNs", |b| {
b.iter(|| write_batch(&batch).unwrap())
});

group.finish();
}

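The new bench reuses the file's existing `write_batch` helper, which is defined outside this hunk. A hedged sketch of what such a helper typically looks like in this bench (an assumption, not code from this diff) is a write of the whole batch into an in-memory buffer with default writer properties:

```rust
use arrow::record_batch::RecordBatch;
use parquet::arrow::ArrowWriter;
use parquet::errors::Result;

// Hypothetical stand-in for the bench's `write_batch` helper: serialize the
// RecordBatch to an in-memory Parquet buffer using default writer properties.
fn write_batch_sketch(batch: &RecordBatch) -> Result<()> {
    let mut buffer = Vec::with_capacity(1024);
    let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None)?;
    writer.write(batch)?;
    writer.close()?;
    Ok(())
}
```

If the helper is indeed in-memory like this, the reported bytes/second reflects encoding cost only, with no disk I/O in the measured path.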
parquet/src/arrow/arrow_writer/mod.rs (39 additions, 0 deletions)

@@ -1095,6 +1095,7 @@ mod tests {
use arrow::{array::*, buffer::Buffer};
use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer};
use arrow_schema::Fields;
use half::f16;

use crate::basic::Encoding;
use crate::data_type::AsBytes;
@@ -1763,6 +1764,44 @@
);
}

#[test]
fn arrow_writer_float_nans() {
let f16_field = Field::new("a", DataType::Float16, false);
let f32_field = Field::new("b", DataType::Float32, false);
let f64_field = Field::new("c", DataType::Float64, false);
let schema = Schema::new(vec![f16_field, f32_field, f64_field]);

let f16_values = (0..MEDIUM_SIZE)
.map(|i| {
Some(if i % 2 == 0 {
f16::NAN
} else {
f16::from_f32(i as f32)
})
})
.collect::<Float16Array>();

let f32_values = (0..MEDIUM_SIZE)
.map(|i| Some(if i % 2 == 0 { f32::NAN } else { i as f32 }))
.collect::<Float32Array>();

let f64_values = (0..MEDIUM_SIZE)
.map(|i| Some(if i % 2 == 0 { f64::NAN } else { i as f64 }))
.collect::<Float64Array>();

let batch = RecordBatch::try_new(
Arc::new(schema),
vec![
Arc::new(f16_values),
Arc::new(f32_values),
Arc::new(f64_values),
],
)
.unwrap();

roundtrip(batch, None);
}

const SMALL_SIZE: usize = 7;
const MEDIUM_SIZE: usize = 63;

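The test delegates verification to the module's `roundtrip` helper, which is not part of this diff. A self-contained sketch of the same idea, assuming the in-memory `bytes::Bytes` reader path (the actual helper also compares schemas, row counts, and more):

```rust
use std::sync::Arc;

use arrow_array::{Array, ArrayRef, Float64Array, RecordBatch};
use bytes::Bytes;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ArrowWriter;

fn main() {
    // Build a small batch with NaNs interleaved, mirroring the test above.
    let values = Float64Array::from(vec![Some(f64::NAN), Some(1.0), Some(f64::NAN)]);
    let batch = RecordBatch::try_from_iter(vec![("c", Arc::new(values) as ArrayRef)]).unwrap();

    // Write the batch to an in-memory Parquet buffer.
    let mut buffer = Vec::new();
    let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap();
    writer.write(&batch).unwrap();
    writer.close().unwrap();

    // Read it back and confirm the NaN positions survived the round trip.
    let reader = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(buffer))
        .unwrap()
        .build()
        .unwrap();
    let batches: Vec<RecordBatch> = reader.collect::<Result<_, _>>().unwrap();
    let col = batches[0]
        .column(0)
        .as_any()
        .downcast_ref::<Float64Array>()
        .unwrap();
    assert!(col.value(0).is_nan());
    assert_eq!(col.value(1), 1.0);
    assert!(col.value(2).is_nan());
}
```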