From e93d1ee98cd4674efd14a082b2f0e780e6a1d320 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 16 Jan 2024 15:24:17 +0100 Subject: [PATCH] GH-39562: [C++][Parquet] Fix crash in test_parquet_dataset_lazy_filtering (#39632) ### Rationale for this change `ParquetFileFragment` stores a `SchemaManifest` that has a raw pointer to a `SchemaDescriptor`. The `SchemaDescriptor` is originally provided by a `FileMetadata` instance but, in some cases, the `FileMetadata` instance can be destroyed while the `ParquetFileFragment` is still in use. This can typically lead to bugs or crashes. ### What changes are included in this PR? Ensure that `ParquetFileFragment` keeps an owning pointer to the `FileMetadata` instance that provides its `SchemaManifest`'s schema descriptor. ### Are these changes tested? An assertion is added that would fail deterministically in the Python test suite. ### Are there any user-facing changes? No. * Closes: #39562 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/dataset/file_parquet.cc | 14 +++++++++++--- cpp/src/arrow/dataset/file_parquet.h | 5 ++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 0ce08502921f3..140917a2e6341 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -813,11 +813,17 @@ Status ParquetFileFragment::EnsureCompleteMetadata(parquet::arrow::FileReader* r Status ParquetFileFragment::SetMetadata( std::shared_ptr metadata, - std::shared_ptr manifest) { + std::shared_ptr manifest, + std::shared_ptr original_metadata) { DCHECK(row_groups_.has_value()); metadata_ = std::move(metadata); manifest_ = std::move(manifest); + original_metadata_ = original_metadata ? std::move(original_metadata) : metadata_; + // The SchemaDescriptor needs to be owned by a FileMetaData instance, + // because SchemaManifest only stores a raw pointer (GH-39562). + DCHECK_EQ(manifest_->descr, original_metadata_->schema()) + << "SchemaDescriptor should be owned by the original FileMetaData"; statistics_expressions_.resize(row_groups_->size(), compute::literal(true)); statistics_expressions_complete_.resize(manifest_->descr->num_columns(), false); @@ -846,7 +852,8 @@ Result ParquetFileFragment::SplitByRowGroup( parquet_format_.MakeFragment(source_, partition_expression(), physical_schema_, {row_group})); - RETURN_NOT_OK(fragment->SetMetadata(metadata_, manifest_)); + RETURN_NOT_OK(fragment->SetMetadata(metadata_, manifest_, + /*original_metadata=*/original_metadata_)); fragments[i++] = std::move(fragment); } @@ -1106,7 +1113,8 @@ ParquetDatasetFactory::CollectParquetFragments(const Partitioning& partitioning) format_->MakeFragment({path, filesystem_}, std::move(partition_expression), physical_schema_, std::move(row_groups))); - RETURN_NOT_OK(fragment->SetMetadata(metadata_subset, manifest_)); + RETURN_NOT_OK(fragment->SetMetadata(metadata_subset, manifest_, + /*original_metadata=*/metadata_)); fragments[i++] = std::move(fragment); } diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index 1e81a34fb3cf0..5141f36385e3f 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ b/cpp/src/arrow/dataset/file_parquet.h @@ -188,7 +188,8 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment { std::optional> row_groups); Status SetMetadata(std::shared_ptr metadata, - std::shared_ptr manifest); + std::shared_ptr manifest, + std::shared_ptr original_metadata = {}); // Overridden to opportunistically set metadata since a reader must be opened anyway. Result> ReadPhysicalSchemaImpl() override { @@ -219,6 +220,8 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment { std::vector statistics_expressions_complete_; std::shared_ptr metadata_; std::shared_ptr manifest_; + // The FileMetaData that owns the SchemaDescriptor pointed by SchemaManifest. + std::shared_ptr original_metadata_; friend class ParquetFileFormat; friend class ParquetDatasetFactory;