Skip to content

Commit

Permalink
fix Incorrect statistics read for i8 i16 columns in parquet (apache#1…
Browse files Browse the repository at this point in the history
…0629)

* fix Incorrect statistics read for i8 i16 columns in parquet

* fix failed test

* Fix merge problem

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
Lordworms and alamb authored May 23, 2024
1 parent 19d9174 commit 8f3084a
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 8 deletions.
28 changes: 22 additions & 6 deletions datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ macro_rules! get_statistic {
*scale,
))
}
Some(DataType::Int8) => {
Some(ScalarValue::Int8(Some((*s.$func()).try_into().unwrap())))
}
Some(DataType::Int16) => {
Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap())))
}
Some(DataType::Date32) => {
Some(ScalarValue::Date32(Some(*s.$func())))
}
Expand Down Expand Up @@ -373,8 +379,8 @@ mod test {
use arrow::datatypes::{Date32Type, Date64Type};
use arrow_array::{
new_null_array, Array, BinaryArray, BooleanArray, Date32Array, Date64Array,
Decimal128Array, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch,
StringArray, StructArray, TimestampNanosecondArray,
Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array,
Int8Array, RecordBatch, StringArray, StructArray, TimestampNanosecondArray,
};
use arrow_schema::{Field, SchemaRef};
use bytes::Bytes;
Expand Down Expand Up @@ -856,13 +862,13 @@ mod test {
})
.with_column(ExpectedColumn {
name: "tinyint_col",
expected_min: i32_array([Some(0)]),
expected_max: i32_array([Some(9)]),
expected_min: i8_array([Some(0)]),
expected_max: i8_array([Some(9)]),
})
.with_column(ExpectedColumn {
name: "smallint_col",
expected_min: i32_array([Some(0)]),
expected_max: i32_array([Some(9)]),
expected_min: i16_array([Some(0)]),
expected_max: i16_array([Some(9)]),
})
.with_column(ExpectedColumn {
name: "int_col",
Expand Down Expand Up @@ -1088,6 +1094,16 @@ mod test {
Arc::new(array)
}

/// Collects the optional `i8` values from `input` into an [`Int8Array`] and
/// returns it behind an `Arc` as an [`ArrayRef`].
fn i8_array(input: impl IntoIterator<Item = Option<i8>>) -> ArrayRef {
    Arc::new(input.into_iter().collect::<Int8Array>())
}

/// Collects the optional `i16` values from `input` into an [`Int16Array`] and
/// returns it behind an `Arc` as an [`ArrayRef`].
fn i16_array(input: impl IntoIterator<Item = Option<i16>>) -> ArrayRef {
    Arc::new(input.into_iter().collect::<Int16Array>())
}

fn i32_array(input: impl IntoIterator<Item = Option<i32>>) -> ArrayRef {
let array: Int32Array = input.into_iter().collect();
Arc::new(array)
Expand Down
2 changes: 0 additions & 2 deletions datafusion/core/tests/parquet/arrow_statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,6 @@ async fn test_int_32() {
// Note that the file has 4 columns named "i8", "i16", "i32", "i64".
// - The tests on column i32 and i64 passed.
// - The tests on column i8 and i16 failed.
#[ignore]
#[tokio::test]
async fn test_int_16() {
// This creates a parquet file with 4 columns named "i8", "i16", "i32", "i64"
Expand Down Expand Up @@ -421,7 +420,6 @@ async fn test_int_16() {

// BUG (same as above): ignore this test for now
// https://github.com/apache/datafusion/issues/10585
#[ignore]
#[tokio::test]
async fn test_int_8() {
// This creates a parquet file with 4 columns named "i8", "i16", "i32", "i64"
Expand Down

0 comments on commit 8f3084a

Please sign in to comment.