From f4bb574aeb566794ed6442ca6a3dc5b9e587cbcc Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 29 Nov 2022 17:39:33 -0800 Subject: [PATCH] Fix page size calculation in Parquet writer (#12182) When calculating page boundaries, the current Parquet writer does not take into account storage needed per page for repetition and definition level data. As a consequence pages may sometimes exceed the specified limit, which in turn impacts the ability to compress these pages with codecs that have a maximum buffer size. This PR fixes the page size calculation to take repetition and definition levels into account. ~~This also incorporates the fragment size reduction from 5000 to 1000 that was suggested in #12130~~ Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/12182 --- cpp/src/io/parquet/page_enc.cu | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 21f06afa4e0..5a2e24b9ec2 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -352,12 +352,18 @@ __global__ void __launch_bounds__(128) ? frag_g.num_leaf_values * util::div_rounding_up_unsafe(ck_g.dict_rle_bits, 8) : frag_g.fragment_data_size; // TODO (dm): this convoluted logic to limit page size needs refactoring - size_t this_max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 - : (values_in_page * 3 >= ck_g.num_values) ? 384 * 1024 - : 512 * 1024; + long this_max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 + : (values_in_page * 3 >= ck_g.num_values) ? 384 * 1024 + : 512 * 1024; // override this_max_page_size if the requested size is smaller - this_max_page_size = min(this_max_page_size, max_page_size_bytes); + this_max_page_size = min(this_max_page_size, static_cast(max_page_size_bytes)); + + // subtract size of rep and def level vectors + auto num_vals = values_in_page + frag_g.num_values; + // underflow is ok here, it just means the test below will always be true + this_max_page_size -= max_RLE_page_size(col_g.num_def_level_bits(), num_vals) + + max_RLE_page_size(col_g.num_rep_level_bits(), num_vals); if (num_rows >= ck_g.num_rows || (values_in_page > 0 && (page_size + fragment_data_size > this_max_page_size)) ||