GH-39064: [C++][Parquet] Support row group filtering for nested paths for struct fields
jorisvandenbossche committed Dec 4, 2023
1 parent f7947cc commit 3312381
Showing 2 changed files with 13 additions and 7 deletions.
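The change below lets a predicate on a child of a struct column take part in Parquet row-group pruning. As a minimal illustration (assuming a dataset column a of type struct<b: int64>; the helper name NestedPredicate is not part of the codebase), such a filter carries a nested FieldRef that the fragment can now map onto column-chunk statistics:

#include <cstdint>

#include "arrow/compute/expression.h"
#include "arrow/type.h"

// Illustrative only: a filter on the nested leaf a.b. With this change the
// Parquet fragment can resolve the nested FieldRef through the struct and use
// the leaf column's row-group statistics to skip row groups.
arrow::compute::Expression NestedPredicate() {
  namespace cp = arrow::compute;
  return cp::greater(cp::field_ref(arrow::FieldRef("a", "b")),
                     cp::literal(std::int64_t{10}));
}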
18 changes: 12 additions & 6 deletions cpp/src/arrow/dataset/file_parquet.cc
@@ -158,7 +158,7 @@ bool IsNan(const Scalar& value) {
}

std::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
-const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) {
+const FieldRef& field_ref, const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) {
// For the remainder of this function, failures to extract/parse statistics
// are ignored by returning std::nullopt. The goal is twofold: first,
// avoid an optimization which breaks the computation; second, allow the
@@ -177,7 +177,7 @@ std::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
return std::nullopt;
}

-return ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics);
+return ParquetFileFragment::EvaluateStatisticsAsExpression(*field, field_ref, *statistics);
}

void AddColumnIndices(const SchemaField& schema_field,
@@ -357,8 +357,8 @@ Result<bool> IsSupportedParquetFile(const ParquetFileFormat& format,
} // namespace

std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpression(
-const Field& field, const parquet::Statistics& statistics) {
-auto field_expr = compute::field_ref(field.name());
+const Field& field, const FieldRef& field_ref, const parquet::Statistics& statistics) {
+auto field_expr = compute::field_ref(field_ref);

// Optimize for corner case where all values are nulls
if (statistics.num_values() == 0 && statistics.null_count() > 0) {
@@ -900,13 +900,19 @@ Result<std::vector<compute::Expression>> ParquetFileFragment::TestRowGroups(
if (statistics_expressions_complete_[match[0]]) continue;
statistics_expressions_complete_[match[0]] = true;

-const SchemaField& schema_field = manifest_->schema_fields[match[0]];
+const SchemaField* schema_field = &manifest_->schema_fields[match[0]];
+for (size_t i = 1; i < match.indices().size(); ++i) {
+  if (schema_field->field->type()->id() != Type::STRUCT) {
+    return Status::Invalid("nested paths only supported for structs");
+  }
+  schema_field = &schema_field->children[match[i]];
+}
int i = 0;
for (int row_group : *row_groups_) {
auto row_group_metadata = metadata_->RowGroup(row_group);

if (auto minmax =
-ColumnChunkStatisticsAsExpression(schema_field, *row_group_metadata)) {
+ColumnChunkStatisticsAsExpression(ref, *schema_field, *row_group_metadata)) {
FoldingAnd(&statistics_expressions_[i], std::move(*minmax));
ARROW_ASSIGN_OR_RAISE(statistics_expressions_[i],
statistics_expressions_[i].Bind(*physical_schema_));
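For context, here is a hedged sketch of the index-path traversal that the new loop in TestRowGroups performs, restated against a plain arrow::Schema rather than the dataset's internal SchemaField manifest (ResolveLeaf and its error message are illustrative, not part of the commit):

#include <cstddef>
#include <memory>
#include <vector>

#include "arrow/api.h"

// Illustrative only: FieldRef("a", "b").FindOneOrNone(schema) yields an index
// path such as {0, 1}; walking it through struct children ends at the leaf
// field whose Parquet column-chunk statistics back the pruning expression.
arrow::Result<std::shared_ptr<arrow::Field>> ResolveLeaf(
    const arrow::Schema& schema, const arrow::FieldRef& ref) {
  ARROW_ASSIGN_OR_RAISE(arrow::FieldPath path, ref.FindOneOrNone(schema));
  const std::vector<int>& indices = path.indices();
  if (indices.empty()) return std::shared_ptr<arrow::Field>{};  // no match
  std::shared_ptr<arrow::Field> field = schema.field(indices[0]);
  for (std::size_t i = 1; i < indices.size(); ++i) {
    if (field->type()->id() != arrow::Type::STRUCT) {
      return arrow::Status::Invalid("nested paths only supported for structs");
    }
    field = field->type()->field(indices[i]);
  }
  return field;
}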
2 changes: 1 addition & 1 deletion cpp/src/arrow/dataset/file_parquet.h
@@ -175,7 +175,7 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment {
Result<std::shared_ptr<Fragment>> Subset(std::vector<int> row_group_ids);

static std::optional<compute::Expression> EvaluateStatisticsAsExpression(
-const Field& field, const parquet::Statistics& statistics);
+const Field& field, const FieldRef& field_ref, const parquet::Statistics& statistics);

private:
ParquetFileFragment(FileSource source, std::shared_ptr<FileFormat> format,
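With the extra FieldRef argument, the guarantee that EvaluateStatisticsAsExpression produces references the full nested path instead of only the leaf field's name, so it lines up with predicates like the one sketched above. A rough sketch of that expression shape, assuming min and max Datum values pulled from column-chunk statistics (MinMaxGuarantee is an illustrative name, not an Arrow API):

#include "arrow/compute/expression.h"
#include "arrow/datum.h"
#include "arrow/type.h"

// Illustrative only: fold row-group min/max statistics into a guarantee keyed
// by the full (possibly nested) FieldRef.
arrow::compute::Expression MinMaxGuarantee(const arrow::FieldRef& ref,
                                           const arrow::Datum& min,
                                           const arrow::Datum& max) {
  namespace cp = arrow::compute;
  auto field_expr = cp::field_ref(ref);
  return cp::and_(cp::greater_equal(field_expr, cp::literal(min)),
                  cp::less_equal(field_expr, cp::literal(max)));
}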
