From 25c67552e339e38322720340c70d36b9388e40b4 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Fri, 14 Jun 2024 09:35:31 -0500 Subject: [PATCH 1/3] Use the number of rows from the entire subpass instead of just the last page when validating row counts for list columns in the chunked reader. --- cpp/src/io/parquet/reader_impl_preprocess.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 7cb982f103d..e0495fb116b 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1436,7 +1436,8 @@ void reader::impl::preprocess_subpass_pages(read_mode mode, size_t chunk_read_li // subpass since we know that will safely completed. bool const is_list = chunk.max_level[level_type::REPETITION] > 0; if (is_list && max_col_row < last_pass_row) { - size_t const min_col_row = static_cast(chunk.start_row + last_page.chunk_row); + auto const& first_page = subpass.pages[page_index]; + size_t const min_col_row = static_cast(chunk.start_row + first_page.chunk_row); CUDF_EXPECTS((max_col_row - min_col_row) > 1, "Unexpected short subpass"); max_col_row--; } From 28650230c743c45f5bf5a89704350b5944a5b8d5 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Mon, 24 Jun 2024 15:52:46 -0500 Subject: [PATCH 2/3] Make sure to always include 2 rows per subpass, to guarantee that we get 1 full row start/end for list columns. --- cpp/src/io/parquet/reader_impl_chunking.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 9ad5a2d6e8d..345589438a3 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -337,7 +337,8 @@ int64_t find_next_split(int64_t cur_pos, size_t cur_row_index, size_t cur_cumulative_size, cudf::host_span sizes, - size_t size_limit) + size_t size_limit, + size_t min_row_count) { auto const start = thrust::make_transform_iterator( sizes.begin(), @@ -357,7 +358,7 @@ int64_t find_next_split(int64_t cur_pos, // this guarantees that even if we cannot fit the set of rows represented by our where our cur_pos // is, we will still move forward instead of failing. while (split_pos < (static_cast(sizes.size()) - 1) && - (sizes[split_pos].end_row_index == cur_row_index)) { + (sizes[split_pos].end_row_index - cur_row_index < min_row_count)) { split_pos++; } @@ -657,8 +658,10 @@ std::tuple, size_t, size_t> compute_next_subpass( auto const start_index = find_start_index(h_aggregated_info, start_row); auto const cumulative_size = start_row == 0 || start_index == 0 ? 0 : h_aggregated_info[start_index - 1].size_bytes; + // when choosing subpasses, we need to guarantee at least 2 rows in the included pages so that all + // list columns have a clear start and end. auto const end_index = - find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit); + find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit, 2); auto const end_row = h_aggregated_info[end_index].end_row_index; // for each column, collect the set of pages that spans start_row / end_row @@ -704,7 +707,7 @@ std::vector compute_page_splits_by_row(device_span Date: Wed, 26 Jun 2024 09:03:37 -0700 Subject: [PATCH 3/3] Formatting. --- cpp/src/io/parquet/reader_impl_chunking.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 345589438a3..d371ef5de93 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -658,7 +658,7 @@ std::tuple, size_t, size_t> compute_next_subpass( auto const start_index = find_start_index(h_aggregated_info, start_row); auto const cumulative_size = start_row == 0 || start_index == 0 ? 0 : h_aggregated_info[start_index - 1].size_bytes; - // when choosing subpasses, we need to guarantee at least 2 rows in the included pages so that all + // when choosing subpasses, we need to guarantee at least 2 rows in the included pages so that all // list columns have a clear start and end. auto const end_index = find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit, 2); @@ -706,8 +706,8 @@ std::vector compute_page_splits_by_row(device_span