From 9fe127082daefaa5b90ee56686fc2cc68aa6fa9c Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 18 Jul 2023 03:25:52 -0400 Subject: [PATCH] Parquet uses row group row count if missing from header (#13712) When investigating [this issue](https://github.com/rapidsai/cudf/issues/13664) I noticed that the file provided has 0 rows in the header. This caused cudf's parquet reader to fail at reading the file, but other tools such as `parq` and `parquet-tools` had no issues reading the file. This change counts up the number of rows in the row groups of the file and will complain loudly if the numbers differ, but not if the main header is 0. This allows us to properly read the data inside this file. Note that it will not properly parse it as a list of structs yet; that will be fixed in another PR. I didn't add a test since this is the only file I have seen with this issue and we can't read it yet in cudf. A test will be added for reading this file, which will test this change as well, with the PR for that issue. 
Authors: - Mike Wilson (https://github.com/hyperbolic2346) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/13712 --- cpp/src/io/parquet/reader_impl_helpers.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 006b8d69aad..a5f71394084 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -233,7 +233,13 @@ int64_t aggregate_reader_metadata::calc_num_rows() const { return std::accumulate( per_file_metadata.begin(), per_file_metadata.end(), 0l, [](auto& sum, auto& pfm) { - return sum + pfm.num_rows; + auto const rowgroup_rows = std::accumulate( + pfm.row_groups.begin(), pfm.row_groups.end(), 0l, [](auto& rg_sum, auto& rg) { + return rg_sum + rg.num_rows; + }); + CUDF_EXPECTS(pfm.num_rows == 0 || pfm.num_rows == rowgroup_rows, + "Header and row groups disagree about number of rows in file!"); + return sum + (pfm.num_rows == 0 && rowgroup_rows > 0 ? rowgroup_rows : pfm.num_rows); }); }