diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index 3afe4ec85cf49..27fb0ae214df2 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -158,7 +158,7 @@ bool IsNan(const Scalar& value) {
 }
 
 std::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
-    const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) {
+    const FieldRef& field_ref, const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) {
   // For the remaining of this function, failure to extract/parse statistics
   // are ignored by returning nullptr. The goal is two fold. First
   // avoid an optimization which breaks the computation. Second, allow the
@@ -177,7 +177,7 @@ std::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
     return std::nullopt;
   }
 
-  return ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics);
+  return ParquetFileFragment::EvaluateStatisticsAsExpression(*field, field_ref, *statistics);
 }
 
 void AddColumnIndices(const SchemaField& schema_field,
@@ -357,8 +357,8 @@ Result<bool> IsSupportedParquetFile(const ParquetFileFormat& format,
 }  // namespace
 
 std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpression(
-    const Field& field, const parquet::Statistics& statistics) {
-  auto field_expr = compute::field_ref(field.name());
+    const Field& field, const FieldRef& field_ref, const parquet::Statistics& statistics) {
+  auto field_expr = compute::field_ref(field_ref);
 
   // Optimize for corner case where all values are nulls
   if (statistics.num_values() == 0 && statistics.null_count() > 0) {
@@ -900,13 +900,20 @@ Result<std::vector<int>> ParquetFileFragment::TestRowGroups(
     if (statistics_expressions_complete_[match[0]]) continue;
     statistics_expressions_complete_[match[0]] = true;
 
-    const SchemaField& schema_field = manifest_->schema_fields[match[0]];
+    // Walk from the matched top-level column down into nested struct children.
+    const SchemaField* schema_field = &manifest_->schema_fields[match[0]];
+    for (size_t i = 1; i < match.indices().size(); ++i) {
+      if (schema_field->field->type()->id() != Type::STRUCT) {
+        return Status::Invalid("Nested paths only supported for structs");
+      }
+      schema_field = &schema_field->children[match[i]];
+    }
     int i = 0;
     for (int row_group : *row_groups_) {
       auto row_group_metadata = metadata_->RowGroup(row_group);
 
       if (auto minmax =
-              ColumnChunkStatisticsAsExpression(schema_field, *row_group_metadata)) {
+              ColumnChunkStatisticsAsExpression(ref, *schema_field, *row_group_metadata)) {
         FoldingAnd(&statistics_expressions_[i], std::move(*minmax));
         ARROW_ASSIGN_OR_RAISE(statistics_expressions_[i],
                               statistics_expressions_[i].Bind(*physical_schema_));
diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h
index f527ce5d70ae0..c09101e58f74b 100644
--- a/cpp/src/arrow/dataset/file_parquet.h
+++ b/cpp/src/arrow/dataset/file_parquet.h
@@ -175,7 +175,7 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment {
   Result<std::shared_ptr<Fragment>> Subset(std::vector<int> row_group_ids);
 
   static std::optional<compute::Expression> EvaluateStatisticsAsExpression(
-      const Field& field, const parquet::Statistics& statistics);
+      const Field& field, const FieldRef& field_ref, const parquet::Statistics& statistics);
 
  private:
   ParquetFileFragment(FileSource source, std::shared_ptr<FileFormat> format,
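
For context, threading the original `FieldRef` through `ColumnChunkStatisticsAsExpression` and `EvaluateStatisticsAsExpression` lets the resulting guarantee expression name the nested path itself (`compute::field_ref(field_ref)`) rather than only a top-level column name, so predicates on struct children can be simplified against row-group statistics. Below is a minimal usage sketch of what this enables; it is a hypothetical illustration, not part of this diff, assuming a dataset whose schema has a struct column `a` with an integer child `b`, using the existing `ScannerBuilder` API:

```cpp
#include <memory>

#include <arrow/api.h>
#include <arrow/compute/expression.h>
#include <arrow/dataset/api.h>

// Hypothetical illustration: with nested statistics support, a filter on the
// struct child "a.b" can prune entire row groups during the scan instead of
// being applied only after decoding.
arrow::Result<std::shared_ptr<arrow::Table>> ScanNested(
    const std::shared_ptr<arrow::dataset::Dataset>& dataset) {
  namespace cp = arrow::compute;
  ARROW_ASSIGN_OR_RAISE(auto builder, dataset->NewScan());
  // FieldRef("a", "b") is a nested reference: column "a", child field "b".
  ARROW_RETURN_NOT_OK(builder->Filter(
      cp::greater(cp::field_ref(arrow::FieldRef("a", "b")), cp::literal(1))));
  ARROW_ASSIGN_OR_RAISE(auto scanner, builder->Finish());
  return scanner->ToTable();
}
```

Previously, `TestRowGroups` resolved a nested reference to its top-level column only (`match[0]`), so leaf statistics were never consulted; the loop over `match.indices()` in the patch walks the `SchemaField` tree down to the referenced leaf instead.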