-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Extract parquet statistics to its own module, add tests #8294
Changes from 1 commit
d187e36
7512c8b
d4e660a
fd2aebc
96a42f9
a128a20
b4009c2
ef79c42
9b914db
b95dea9
cd3c042
ab95453
0235a9e
a601fbf
5c55302
06b5201
641142b
e5cd8cf
b1666c2
7022691
a5e235a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,31 +15,26 @@ | |
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
use arrow::{ | ||
array::ArrayRef, | ||
datatypes::{DataType, Schema}, | ||
}; | ||
use arrow::{array::ArrayRef, datatypes::Schema}; | ||
use datafusion_common::tree_node::{TreeNode, VisitRecursion}; | ||
use datafusion_common::{Column, DataFusionError, Result, ScalarValue}; | ||
use parquet::{ | ||
arrow::{async_reader::AsyncFileReader, ParquetRecordBatchStreamBuilder}, | ||
bloom_filter::Sbbf, | ||
file::{metadata::RowGroupMetaData, statistics::Statistics as ParquetStatistics}, | ||
file::metadata::RowGroupMetaData, | ||
}; | ||
use std::{ | ||
collections::{HashMap, HashSet}, | ||
sync::Arc, | ||
}; | ||
|
||
use crate::datasource::{ | ||
listing::FileRange, | ||
physical_plan::parquet::{from_bytes_to_i128, parquet_to_arrow_decimal_type}, | ||
}; | ||
use crate::datasource::listing::FileRange; | ||
use crate::logical_expr::Operator; | ||
use crate::physical_expr::expressions as phys_expr; | ||
use crate::physical_optimizer::pruning::{PruningPredicate, PruningStatistics}; | ||
use crate::physical_plan::PhysicalExpr; | ||
|
||
use super::statistics::RowGroupStatisticsConverter; | ||
use super::ParquetFileMetrics; | ||
|
||
/// Prune row groups based on statistics | ||
|
@@ -303,112 +298,6 @@ struct RowGroupPruningStatistics<'a> { | |
parquet_schema: &'a Schema, | ||
} | ||
|
||
/// Extract the min/max statistics from a `ParquetStatistics` object | ||
macro_rules! get_statistic { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This macro is moved, without modification, into |
||
($column_statistics:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{ | ||
if !$column_statistics.has_min_max_set() { | ||
return None; | ||
} | ||
match $column_statistics { | ||
ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))), | ||
ParquetStatistics::Int32(s) => { | ||
match $target_arrow_type { | ||
// int32 to decimal with the precision and scale | ||
Some(DataType::Decimal128(precision, scale)) => { | ||
Some(ScalarValue::Decimal128( | ||
Some(*s.$func() as i128), | ||
precision, | ||
scale, | ||
)) | ||
} | ||
_ => Some(ScalarValue::Int32(Some(*s.$func()))), | ||
} | ||
} | ||
ParquetStatistics::Int64(s) => { | ||
match $target_arrow_type { | ||
// int64 to decimal with the precision and scale | ||
Some(DataType::Decimal128(precision, scale)) => { | ||
Some(ScalarValue::Decimal128( | ||
Some(*s.$func() as i128), | ||
precision, | ||
scale, | ||
)) | ||
} | ||
_ => Some(ScalarValue::Int64(Some(*s.$func()))), | ||
} | ||
} | ||
// 96 bit ints not supported | ||
ParquetStatistics::Int96(_) => None, | ||
ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))), | ||
ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))), | ||
ParquetStatistics::ByteArray(s) => { | ||
match $target_arrow_type { | ||
// decimal data type | ||
Some(DataType::Decimal128(precision, scale)) => { | ||
Some(ScalarValue::Decimal128( | ||
Some(from_bytes_to_i128(s.$bytes_func())), | ||
precision, | ||
scale, | ||
)) | ||
} | ||
_ => { | ||
let s = std::str::from_utf8(s.$bytes_func()) | ||
.map(|s| s.to_string()) | ||
.ok(); | ||
Some(ScalarValue::Utf8(s)) | ||
} | ||
} | ||
} | ||
// type not supported yet | ||
ParquetStatistics::FixedLenByteArray(s) => { | ||
match $target_arrow_type { | ||
// just support the decimal data type | ||
Some(DataType::Decimal128(precision, scale)) => { | ||
Some(ScalarValue::Decimal128( | ||
Some(from_bytes_to_i128(s.$bytes_func())), | ||
precision, | ||
scale, | ||
)) | ||
} | ||
_ => None, | ||
} | ||
} | ||
} | ||
}}; | ||
} | ||
|
||
// Extract the min or max value calling `func` or `bytes_func` on the ParquetStatistics as appropriate | ||
macro_rules! get_min_max_values { | ||
($self:expr, $column:expr, $func:ident, $bytes_func:ident) => {{ | ||
let (_column_index, field) = | ||
if let Some((v, f)) = $self.parquet_schema.column_with_name(&$column.name) { | ||
(v, f) | ||
} else { | ||
// Named column was not present | ||
return None; | ||
}; | ||
|
||
let data_type = field.data_type(); | ||
// The result may be None, because DataFusion doesn't have support for ScalarValues of the column type | ||
let null_scalar: ScalarValue = data_type.try_into().ok()?; | ||
|
||
$self.row_group_metadata | ||
.columns() | ||
.iter() | ||
.find(|c| c.column_descr().name() == &$column.name) | ||
.and_then(|c| if c.statistics().is_some() {Some((c.statistics().unwrap(), c.column_descr()))} else {None}) | ||
.map(|(stats, column_descr)| | ||
{ | ||
let target_data_type = parquet_to_arrow_decimal_type(column_descr); | ||
get_statistic!(stats, $func, $bytes_func, target_data_type) | ||
}) | ||
.flatten() | ||
// column either didn't have statistics at all or didn't have min/max values | ||
.or_else(|| Some(null_scalar.clone())) | ||
.and_then(|s| s.to_array().ok()) | ||
}} | ||
} | ||
|
||
// Extract the null count value on the ParquetStatistics | ||
macro_rules! get_null_count_values { | ||
($self:expr, $column:expr) => {{ | ||
|
@@ -431,11 +320,29 @@ macro_rules! get_null_count_values { | |
|
||
impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> { | ||
fn min_values(&self, column: &Column) -> Option<ArrayRef> { | ||
get_min_max_values!(self, column, min, min_bytes) | ||
let field = self | ||
.parquet_schema | ||
.fields() | ||
.find(&column.name) | ||
.map(|(_idx, field)| field)?; | ||
|
||
RowGroupStatisticsConverter::new(field) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is a slight mismatch here as parquet handles schema nesting differently from arrow I'm not sure how There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TLDR is that Column does not address nested fields. The structure that does is datafusion_physical_expr::expressions::GetFieldAccessExpr or |
||
.min([self.row_group_metadata]) | ||
// ignore errors during conversion, and just use no statistics | ||
.ok() | ||
} | ||
|
||
fn max_values(&self, column: &Column) -> Option<ArrayRef> { | ||
get_min_max_values!(self, column, max, max_bytes) | ||
let field = self | ||
.parquet_schema | ||
.fields() | ||
.find(&column.name) | ||
.map(|(_idx, field)| field)?; | ||
|
||
RowGroupStatisticsConverter::new(field) | ||
.max([self.row_group_metadata]) | ||
// ignore errors during conversion, and just use no statistics | ||
.ok() | ||
} | ||
|
||
fn num_containers(&self) -> usize { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved to statistics.rs