From 9fe14b5086f4d046fcd750f5b8f78012ee874a7b Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 7 Apr 2022 16:35:57 -0700 Subject: [PATCH 01/15] add setting of target page size for parquet file writer --- cpp/include/cudf/io/parquet.hpp | 81 +++++++++++++++++++----------- cpp/src/io/parquet/page_enc.cu | 12 ++++- cpp/src/io/parquet/parquet_gpu.hpp | 1 + cpp/src/io/parquet/writer_impl.cu | 5 +- cpp/src/io/parquet/writer_impl.hpp | 1 + 5 files changed, 69 insertions(+), 31 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 2ceac947c8d..1cb26c98419 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -39,6 +39,7 @@ namespace io { constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; // 128MB constexpr size_type default_row_group_size_rows = 1000000; +constexpr size_type default_target_page_size = 512 * 1024; /** * @brief Builds parquet_reader_options to use for `read_parquet()`. @@ -382,6 +383,8 @@ class parquet_writer_options { size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) size_type _row_group_size_rows = default_row_group_size_rows; + // target page size uncompressed + size_t _target_page_size = default_target_page_size; /** * @brief Constructor from sink and table. @@ -482,6 +485,11 @@ class parquet_writer_options { */ auto get_row_group_size_rows() const { return _row_group_size_rows; } + /** + * @brief Returns target page size, in bytes. + */ + auto get_target_page_size() const { return _target_page_size; } + /** * @brief Sets partitions. * @@ -552,24 +560,17 @@ class parquet_writer_options { /** * @brief Sets the maximum row group size, in bytes. */ - void set_row_group_size_bytes(size_t size_bytes) - { - CUDF_EXPECTS( - size_bytes >= 512 * 1024, - "The maximum row group size cannot be smaller than the page size, which is 512KB."); - _row_group_size_bytes = size_bytes; - } + void set_row_group_size_bytes(size_t size_bytes) { _row_group_size_bytes = size_bytes; } /** * @brief Sets the maximum row group size, in rows. */ - void set_row_group_size_rows(size_type size_rows) - { - CUDF_EXPECTS( - size_rows >= 5000, - "The maximum row group size cannot be smaller than the page size, which is 5000 rows."); - _row_group_size_rows = size_rows; - } + void set_row_group_size_rows(size_type size_rows) { _row_group_size_rows = size_rows; } + + /** + * @brief Returns target page size. + */ + void set_target_page_size(size_t target_pgsz) { _target_page_size = target_pgsz; } }; class parquet_writer_options_builder { @@ -699,6 +700,18 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Sets target page size, in bytes. + * + * @param ps The page size to use. + * @return this for chaining. + */ + parquet_writer_options_builder& target_page_size(size_t target_pgsz) + { + options._target_page_size = target_pgsz; + return *this; + } + /** * @brief Sets whether int96 timestamps are written or not in parquet_writer_options. * @@ -783,6 +796,8 @@ class chunked_parquet_writer_options { size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) size_type _row_group_size_rows = default_row_group_size_rows; + // target page size uncompressed + size_t _target_page_size = default_target_page_size; /** * @brief Constructor from sink. @@ -844,6 +859,11 @@ class chunked_parquet_writer_options { */ auto get_row_group_size_rows() const { return _row_group_size_rows; } + /** + * @brief Returns target page size, in bytes. + */ + auto get_target_page_size() const { return _target_page_size; } + /** * @brief Sets metadata. * @@ -888,24 +908,17 @@ class chunked_parquet_writer_options { /** * @brief Sets the maximum row group size, in bytes. */ - void set_row_group_size_bytes(size_t size_bytes) - { - CUDF_EXPECTS( - size_bytes >= 512 * 1024, - "The maximum row group size cannot be smaller than the page size, which is 512KB."); - _row_group_size_bytes = size_bytes; - } + void set_row_group_size_bytes(size_t size_bytes) { _row_group_size_bytes = size_bytes; } /** * @brief Sets the maximum row group size, in rows. */ - void set_row_group_size_rows(size_type size_rows) - { - CUDF_EXPECTS( - size_rows >= 5000, - "The maximum row group size cannot be smaller than the page size, which is 5000 rows."); - _row_group_size_rows = size_rows; - } + void set_row_group_size_rows(size_type size_rows) { _row_group_size_rows = size_rows; } + + /** + * @brief Returns target page size. + */ + void set_target_page_size(size_t target_pgsz) { _target_page_size = target_pgsz; } /** * @brief creates builder to build chunked_parquet_writer_options. @@ -1025,6 +1038,18 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Sets target page size in parquet_writer_options. + * + * @param ps The page size to use. + * @return this for chaining. + */ + chunked_parquet_writer_options_builder& target_page_size(size_t target_pgsz) + { + options._target_page_size = target_pgsz; + return *this; + } + /** * @brief move chunked_parquet_writer_options member once it's built. */ diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index da671d4c665..799b6b38cdf 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -242,7 +242,8 @@ __global__ void __launch_bounds__(128) statistics_merge_group* page_grstats, statistics_merge_group* chunk_grstats, size_t max_page_comp_data_size, - int32_t num_columns) + int32_t num_columns, + size_t target_page_size) { // TODO: All writing seems to be done by thread 0. Could be replaced by thrust foreach __shared__ __align__(8) parquet_column_device_view col_g; @@ -338,6 +339,11 @@ __global__ void __launch_bounds__(128) uint32_t max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 : (values_in_page * 3 >= ck_g.num_values) ? 384 * 1024 : 512 * 1024; + + // override max_page_size if target is smaller + if (max_page_size > target_page_size) + max_page_size = target_page_size; + if (num_rows >= ck_g.num_rows || (values_in_page > 0 && (page_size + fragment_data_size > max_page_size))) { if (ck_g.use_dictionary) { @@ -1941,6 +1947,7 @@ void InitEncoderPages(device_2dspan chunks, device_span pages, device_span col_desc, int32_t num_columns, + size_t target_page_size, statistics_merge_group* page_grstats, statistics_merge_group* chunk_grstats, size_t max_page_comp_data_size, @@ -1949,7 +1956,8 @@ void InitEncoderPages(device_2dspan chunks, auto num_rowgroups = chunks.size().first; dim3 dim_grid(num_columns, num_rowgroups); // 1 threadblock per rowgroup gpuInitPages<<>>( - chunks, pages, col_desc, page_grstats, chunk_grstats, max_page_comp_data_size, num_columns); + chunks, pages, col_desc, page_grstats, chunk_grstats, max_page_comp_data_size, num_columns, + target_page_size); } void EncodePages(device_span pages, diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 8d0aa8881c3..8bd34ad6323 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -579,6 +579,7 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, device_span pages, device_span col_desc, int32_t num_columns, + size_t target_page_size, statistics_merge_group* page_grstats, statistics_merge_group* chunk_grstats, size_t max_page_comp_data_size, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 70a594423c9..91d2d6bebc5 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -893,7 +893,7 @@ void writer::impl::init_page_sizes(hostdevice_2dvector& chu uint32_t num_columns) { chunks.host_to_device(stream); - gpu::InitEncoderPages(chunks, {}, col_desc, num_columns, nullptr, nullptr, 0, stream); + gpu::InitEncoderPages(chunks, {}, col_desc, num_columns, target_page_size_, nullptr, nullptr, 0, stream); chunks.device_to_host(stream, true); } @@ -999,6 +999,7 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector& pages, col_desc, num_columns, + target_page_size_, (num_stats_bfr) ? page_stats_mrg.data() : nullptr, (num_stats_bfr > num_pages) ? page_stats_mrg.data() + num_pages : nullptr, max_page_comp_data_size, @@ -1153,6 +1154,7 @@ writer::impl::impl(std::vector> sinks, stream(stream), max_row_group_size{options.get_row_group_size_bytes()}, max_row_group_rows{options.get_row_group_size_rows()}, + target_page_size_(options.get_target_page_size()), compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), @@ -1175,6 +1177,7 @@ writer::impl::impl(std::vector> sinks, stream(stream), max_row_group_size{options.get_row_group_size_bytes()}, max_row_group_rows{options.get_row_group_size_rows()}, + target_page_size_(options.get_target_page_size()), compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 405ab0c2880..8c3b9870abd 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -210,6 +210,7 @@ class writer::impl { size_t max_row_group_size = default_row_group_size_bytes; size_type max_row_group_rows = default_row_group_size_rows; + size_t target_page_size_ = default_target_page_size; Compression compression_ = Compression::UNCOMPRESSED; statistics_freq stats_granularity_ = statistics_freq::STATISTICS_NONE; bool int96_timestamps = false; From 28c58064e986269e7772720d64b438a3aafc8d9d Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 13 May 2022 15:20:37 -0700 Subject: [PATCH 02/15] split target page size into bytes and rows to match row group size --- cpp/include/cudf/io/parquet.hpp | 87 +++++++++++++++++++++++------- cpp/src/io/parquet/page_enc.cu | 15 +++--- cpp/src/io/parquet/parquet_gpu.hpp | 3 +- cpp/src/io/parquet/writer_impl.cu | 12 +++-- cpp/src/io/parquet/writer_impl.hpp | 3 +- 5 files changed, 89 insertions(+), 31 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 1cb26c98419..59302f24943 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -39,7 +39,8 @@ namespace io { constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; // 128MB constexpr size_type default_row_group_size_rows = 1000000; -constexpr size_type default_target_page_size = 512 * 1024; +constexpr size_t default_target_page_size_bytes = 512 * 1024; +constexpr size_type default_target_page_size_rows = 20000; /** * @brief Builds parquet_reader_options to use for `read_parquet()`. @@ -383,8 +384,10 @@ class parquet_writer_options { size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) size_type _row_group_size_rows = default_row_group_size_rows; - // target page size uncompressed - size_t _target_page_size = default_target_page_size; + // Maximum size of each page (uncompressed) + size_t _target_page_size_bytes = default_target_page_size_bytes; + // Maximum number of rows in a page + size_type _target_page_size_rows = default_target_page_size_rows; /** * @brief Constructor from sink and table. @@ -488,7 +491,12 @@ class parquet_writer_options { /** * @brief Returns target page size, in bytes. */ - auto get_target_page_size() const { return _target_page_size; } + auto get_target_page_size_bytes() const { return _target_page_size_bytes; } + + /** + * @brief Returns target page size, in rows. + */ + auto get_target_page_size_rows() const { return _target_page_size_rows; } /** * @brief Sets partitions. @@ -568,9 +576,14 @@ class parquet_writer_options { void set_row_group_size_rows(size_type size_rows) { _row_group_size_rows = size_rows; } /** - * @brief Returns target page size. + * @brief Sets the maximum page size, in bytes. */ - void set_target_page_size(size_t target_pgsz) { _target_page_size = target_pgsz; } + void set_target_page_size_bytes(size_t pgsz_bytes) { _target_page_size_bytes = pgsz_bytes; } + + /** + * @brief Sets the maximum page size, in rows. + */ + void set_target_page_size_rows(size_type pgsz_rows) { _target_page_size_rows = pgsz_rows; } }; class parquet_writer_options_builder { @@ -701,14 +714,26 @@ class parquet_writer_options_builder { } /** - * @brief Sets target page size, in bytes. + * @brief Sets the maximum page size, in bytes. + * + * @param val The page size to use. + * @return this for chaining. + */ + parquet_writer_options_builder& target_page_size_bytes(size_t val) + { + options.set_target_page_size_bytes(val); + return *this; + } + + /** + * @brief Sets the maximum page size, in rows. * - * @param ps The page size to use. + * @param val The page size to use. * @return this for chaining. */ - parquet_writer_options_builder& target_page_size(size_t target_pgsz) + parquet_writer_options_builder& target_page_size_bytes(size_type val) { - options._target_page_size = target_pgsz; + options.set_target_page_size_rows(val); return *this; } @@ -796,8 +821,10 @@ class chunked_parquet_writer_options { size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) size_type _row_group_size_rows = default_row_group_size_rows; - // target page size uncompressed - size_t _target_page_size = default_target_page_size; + // Maximum size of each page (uncompressed) + size_t _target_page_size_bytes = default_target_page_size_bytes; + // Maximum number of rows in a page + size_type _target_page_size_rows = default_target_page_size_rows; /** * @brief Constructor from sink. @@ -862,7 +889,12 @@ class chunked_parquet_writer_options { /** * @brief Returns target page size, in bytes. */ - auto get_target_page_size() const { return _target_page_size; } + auto get_target_page_size_bytes() const { return _target_page_size_bytes; } + + /** + * @brief Returns target page size, in rows. + */ + auto get_target_page_size_rows() const { return _target_page_size_rows; } /** * @brief Sets metadata. @@ -916,9 +948,14 @@ class chunked_parquet_writer_options { void set_row_group_size_rows(size_type size_rows) { _row_group_size_rows = size_rows; } /** - * @brief Returns target page size. + * @brief Sets the maximum page size, in bytes. */ - void set_target_page_size(size_t target_pgsz) { _target_page_size = target_pgsz; } + void set_target_page_size_bytes(size_t pgsz_bytes) { _target_page_size_bytes = pgsz_bytes; } + + /** + * @brief Sets the maximum page size, in rows. + */ + void set_target_page_size_rows(size_type pgsz_rows) { _target_page_size_rows = pgsz_rows; } /** * @brief creates builder to build chunked_parquet_writer_options. @@ -1039,14 +1076,26 @@ class chunked_parquet_writer_options_builder { } /** - * @brief Sets target page size in parquet_writer_options. + * @brief Sets the maximum page size, in bytes. + * + * @param val maximum page size + * @return this for chaining. + */ + chunked_parquet_writer_options_builder& target_page_size_bytes(size_t val) + { + options.set_target_page_size_bytes(val); + return *this; + } + + /** + * @brief Sets the maximum page size, in rows. * - * @param ps The page size to use. + * @param val maximum page size * @return this for chaining. */ - chunked_parquet_writer_options_builder& target_page_size(size_t target_pgsz) + chunked_parquet_writer_options_builder& target_page_size_rows(size_type val) { - options._target_page_size = target_pgsz; + options.set_target_page_size_rows(val); return *this; } diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 799b6b38cdf..e35b0cd54f8 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -243,7 +243,8 @@ __global__ void __launch_bounds__(128) statistics_merge_group* chunk_grstats, size_t max_page_comp_data_size, int32_t num_columns, - size_t target_page_size) + size_t target_page_size_bytes, + size_type target_page_size_rows) { // TODO: All writing seems to be done by thread 0. Could be replaced by thrust foreach __shared__ __align__(8) parquet_column_device_view col_g; @@ -341,11 +342,12 @@ __global__ void __launch_bounds__(128) : 512 * 1024; // override max_page_size if target is smaller - if (max_page_size > target_page_size) - max_page_size = target_page_size; + if (max_page_size > target_page_size_bytes) + max_page_size = target_page_size_bytes; if (num_rows >= ck_g.num_rows || - (values_in_page > 0 && (page_size + fragment_data_size > max_page_size))) { + (values_in_page > 0 && (page_size + fragment_data_size > max_page_size)) || + rows_in_page > target_page_size_rows) { if (ck_g.use_dictionary) { page_size = 1 + 5 + ((values_in_page * ck_g.dict_rle_bits + 7) >> 3) + (values_in_page >> 8); @@ -1947,7 +1949,8 @@ void InitEncoderPages(device_2dspan chunks, device_span pages, device_span col_desc, int32_t num_columns, - size_t target_page_size, + size_t target_page_size_bytes, + size_type target_page_size_rows, statistics_merge_group* page_grstats, statistics_merge_group* chunk_grstats, size_t max_page_comp_data_size, @@ -1957,7 +1960,7 @@ void InitEncoderPages(device_2dspan chunks, dim3 dim_grid(num_columns, num_rowgroups); // 1 threadblock per rowgroup gpuInitPages<<>>( chunks, pages, col_desc, page_grstats, chunk_grstats, max_page_comp_data_size, num_columns, - target_page_size); + target_page_size_bytes, target_page_size_rows); } void EncodePages(device_span pages, diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 8bd34ad6323..5047ebcf1f7 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -579,7 +579,8 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, device_span pages, device_span col_desc, int32_t num_columns, - size_t target_page_size, + size_t target_page_size_bytes, + size_type target_page_size_rows, statistics_merge_group* page_grstats, statistics_merge_group* chunk_grstats, size_t max_page_comp_data_size, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 91d2d6bebc5..4577e3865df 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -893,7 +893,8 @@ void writer::impl::init_page_sizes(hostdevice_2dvector& chu uint32_t num_columns) { chunks.host_to_device(stream); - gpu::InitEncoderPages(chunks, {}, col_desc, num_columns, target_page_size_, nullptr, nullptr, 0, stream); + gpu::InitEncoderPages(chunks, {}, col_desc, num_columns, target_page_size_bytes, + target_page_size_rows, nullptr, nullptr, 0, stream); chunks.device_to_host(stream, true); } @@ -999,7 +1000,8 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector& pages, col_desc, num_columns, - target_page_size_, + target_page_size_bytes, + target_page_size_rows, (num_stats_bfr) ? page_stats_mrg.data() : nullptr, (num_stats_bfr > num_pages) ? page_stats_mrg.data() + num_pages : nullptr, max_page_comp_data_size, @@ -1154,7 +1156,8 @@ writer::impl::impl(std::vector> sinks, stream(stream), max_row_group_size{options.get_row_group_size_bytes()}, max_row_group_rows{options.get_row_group_size_rows()}, - target_page_size_(options.get_target_page_size()), + target_page_size_bytes(options.get_target_page_size_bytes()), + target_page_size_rows(options.get_target_page_size_rows()), compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), @@ -1177,7 +1180,8 @@ writer::impl::impl(std::vector> sinks, stream(stream), max_row_group_size{options.get_row_group_size_bytes()}, max_row_group_rows{options.get_row_group_size_rows()}, - target_page_size_(options.get_target_page_size()), + target_page_size_bytes(options.get_target_page_size_bytes()), + target_page_size_rows(options.get_target_page_size_rows()), compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 8c3b9870abd..98870163915 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -210,7 +210,8 @@ class writer::impl { size_t max_row_group_size = default_row_group_size_bytes; size_type max_row_group_rows = default_row_group_size_rows; - size_t target_page_size_ = default_target_page_size; + size_t target_page_size_bytes = default_target_page_size_bytes; + size_type target_page_size_rows = default_target_page_size_rows; Compression compression_ = Compression::UNCOMPRESSED; statistics_freq stats_granularity_ = statistics_freq::STATISTICS_NONE; bool int96_timestamps = false; From 1e730ff7147688df2c1e9b4cafc9d0e23f3abd1d Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 13 May 2022 16:06:03 -0700 Subject: [PATCH 03/15] remove row group size unit test for now --- cpp/tests/io/parquet_test.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 3905df2b274..67fd8a01b10 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -3184,6 +3184,10 @@ TEST_F(ParquetReaderTest, EmptyOutput) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +/* temporarily remove this test. with page size being tunable, can no longer just test for + * row groups being smaller than 512KiB. ideally one would use the configured page size, but + * then that would require setting the page size parameter before the row group size, which + * seems odd. should probably test for consistent parameters when instantiating the writer. TEST_F(ParquetWriterTest, RowGroupSizeInvalid) { const auto unused_table = std::make_unique(); @@ -3205,5 +3209,6 @@ TEST_F(ParquetWriterTest, RowGroupSizeInvalid) .row_group_size_bytes(511 << 10), cudf::logic_error); } +*/ CUDF_TEST_PROGRAM_MAIN() From 8534232e1303968ad0d21e9440c3aa3586189cc6 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 17 May 2022 14:05:21 -0700 Subject: [PATCH 04/15] clang-format changes --- cpp/include/cudf/io/parquet.hpp | 6 +++--- cpp/src/io/parquet/page_enc.cu | 15 ++++++++++----- cpp/src/io/parquet/writer_impl.cu | 12 ++++++++++-- cpp/tests/io/parquet_test.cpp | 2 +- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 0f927976e82..1eed1421abd 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -37,9 +37,9 @@ namespace io { * @file */ -constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; // 128MB -constexpr size_type default_row_group_size_rows = 1000000; -constexpr size_t default_target_page_size_bytes = 512 * 1024; +constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; // 128MB +constexpr size_type default_row_group_size_rows = 1000000; +constexpr size_t default_target_page_size_bytes = 512 * 1024; constexpr size_type default_target_page_size_rows = 20000; /** diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index ee63c0a758c..9fff984884e 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -341,8 +341,7 @@ __global__ void __launch_bounds__(128) : 512 * 1024; // override max_page_size if target is smaller - if (max_page_size > target_page_size_bytes) - max_page_size = target_page_size_bytes; + if (max_page_size > target_page_size_bytes) max_page_size = target_page_size_bytes; if (num_rows >= ck_g.num_rows || (values_in_page > 0 && (page_size + fragment_data_size > max_page_size)) || @@ -1944,9 +1943,15 @@ void InitEncoderPages(device_2dspan chunks, { auto num_rowgroups = chunks.size().first; dim3 dim_grid(num_columns, num_rowgroups); // 1 threadblock per rowgroup - gpuInitPages<<>>( - chunks, pages, col_desc, page_grstats, chunk_grstats, max_page_comp_data_size, num_columns, - target_page_size_bytes, target_page_size_rows); + gpuInitPages<<>>(chunks, + pages, + col_desc, + page_grstats, + chunk_grstats, + max_page_comp_data_size, + num_columns, + target_page_size_bytes, + target_page_size_rows); } void EncodePages(device_span pages, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 59c9ba800d1..f46ded26dcf 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -859,8 +859,16 @@ void writer::impl::init_page_sizes(hostdevice_2dvector& chu uint32_t num_columns) { chunks.host_to_device(stream); - gpu::InitEncoderPages(chunks, {}, col_desc, num_columns, target_page_size_bytes, - target_page_size_rows, nullptr, nullptr, 0, stream); + gpu::InitEncoderPages(chunks, + {}, + col_desc, + num_columns, + target_page_size_bytes, + target_page_size_rows, + nullptr, + nullptr, + 0, + stream); chunks.device_to_host(stream, true); } diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 67fd8a01b10..4d791a1c46c 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -3186,7 +3186,7 @@ TEST_F(ParquetReaderTest, EmptyOutput) /* temporarily remove this test. with page size being tunable, can no longer just test for * row groups being smaller than 512KiB. ideally one would use the configured page size, but - * then that would require setting the page size parameter before the row group size, which + * then that would require setting the page size parameter before the row group size, which * seems odd. should probably test for consistent parameters when instantiating the writer. TEST_F(ParquetWriterTest, RowGroupSizeInvalid) { From 938c3b33b727a180dc5d90544b07de9055938638 Mon Sep 17 00:00:00 2001 From: etseidl Date: Wed, 18 May 2022 11:42:46 -0700 Subject: [PATCH 05/15] make argument names consistent per suggestions from code review Co-authored-by: Bradley Dice --- cpp/include/cudf/io/parquet.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 1eed1421abd..04ba25488cf 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -578,12 +578,12 @@ class parquet_writer_options { /** * @brief Sets the maximum page size, in bytes. */ - void set_target_page_size_bytes(size_t pgsz_bytes) { _target_page_size_bytes = pgsz_bytes; } + void set_target_page_size_bytes(size_t size_bytes) { _target_page_size_bytes = size_bytes; } /** * @brief Sets the maximum page size, in rows. */ - void set_target_page_size_rows(size_type pgsz_rows) { _target_page_size_rows = pgsz_rows; } + void set_target_page_size_rows(size_type size_rows) { _target_page_size_rows = size_rows; } }; class parquet_writer_options_builder { From 6c77bfc303101fbd6e8e660b9658ef57561d0f32 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 18 May 2022 12:00:00 -0700 Subject: [PATCH 06/15] change target_page_size to max_page_size to be more clear about the intent of the parameter --- cpp/include/cudf/io/parquet.hpp | 48 +++++++++++++++--------------- cpp/src/io/parquet/page_enc.cu | 16 +++++----- cpp/src/io/parquet/parquet_gpu.hpp | 4 +-- cpp/src/io/parquet/writer_impl.cu | 16 +++++----- cpp/src/io/parquet/writer_impl.hpp | 4 +-- 5 files changed, 44 insertions(+), 44 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 04ba25488cf..b9815bcd05f 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -37,10 +37,10 @@ namespace io { * @file */ -constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; // 128MB -constexpr size_type default_row_group_size_rows = 1000000; -constexpr size_t default_target_page_size_bytes = 512 * 1024; -constexpr size_type default_target_page_size_rows = 20000; +constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; // 128MB +constexpr size_type default_row_group_size_rows = 1000000; +constexpr size_t default_max_page_size_bytes = 512 * 1024; +constexpr size_type default_max_page_size_rows = 20000; /** * @brief Builds parquet_reader_options to use for `read_parquet()`. @@ -385,9 +385,9 @@ class parquet_writer_options { // Maximum number of rows in row group (unless smaller than a single page) size_type _row_group_size_rows = default_row_group_size_rows; // Maximum size of each page (uncompressed) - size_t _target_page_size_bytes = default_target_page_size_bytes; + size_t _max_page_size_bytes = default_max_page_size_bytes; // Maximum number of rows in a page - size_type _target_page_size_rows = default_target_page_size_rows; + size_type _max_page_size_rows = default_max_page_size_rows; /** * @brief Constructor from sink and table. @@ -491,12 +491,12 @@ class parquet_writer_options { /** * @brief Returns target page size, in bytes. */ - auto get_target_page_size_bytes() const { return _target_page_size_bytes; } + auto get_max_page_size_bytes() const { return _max_page_size_bytes; } /** * @brief Returns target page size, in rows. */ - auto get_target_page_size_rows() const { return _target_page_size_rows; } + auto get_max_page_size_rows() const { return _max_page_size_rows; } /** * @brief Sets partitions. @@ -578,12 +578,12 @@ class parquet_writer_options { /** * @brief Sets the maximum page size, in bytes. */ - void set_target_page_size_bytes(size_t size_bytes) { _target_page_size_bytes = size_bytes; } + void set_max_page_size_bytes(size_t size_bytes) { _max_page_size_bytes = size_bytes; } /** * @brief Sets the maximum page size, in rows. */ - void set_target_page_size_rows(size_type size_rows) { _target_page_size_rows = size_rows; } + void set_max_page_size_rows(size_type size_rows) { _max_page_size_rows = size_rows; } }; class parquet_writer_options_builder { @@ -719,9 +719,9 @@ class parquet_writer_options_builder { * @param val The page size to use. * @return this for chaining. */ - parquet_writer_options_builder& target_page_size_bytes(size_t val) + parquet_writer_options_builder& max_page_size_bytes(size_t val) { - options.set_target_page_size_bytes(val); + options.set_max_page_size_bytes(val); return *this; } @@ -731,9 +731,9 @@ class parquet_writer_options_builder { * @param val The page size to use. * @return this for chaining. */ - parquet_writer_options_builder& target_page_size_bytes(size_type val) + parquet_writer_options_builder& max_page_size_rows(size_type val) { - options.set_target_page_size_rows(val); + options.set_max_page_size_rows(val); return *this; } @@ -822,9 +822,9 @@ class chunked_parquet_writer_options { // Maximum number of rows in row group (unless smaller than a single page) size_type _row_group_size_rows = default_row_group_size_rows; // Maximum size of each page (uncompressed) - size_t _target_page_size_bytes = default_target_page_size_bytes; + size_t _max_page_size_bytes = default_max_page_size_bytes; // Maximum number of rows in a page - size_type _target_page_size_rows = default_target_page_size_rows; + size_type _max_page_size_rows = default_max_page_size_rows; /** * @brief Constructor from sink. @@ -889,12 +889,12 @@ class chunked_parquet_writer_options { /** * @brief Returns target page size, in bytes. */ - auto get_target_page_size_bytes() const { return _target_page_size_bytes; } + auto get_max_page_size_bytes() const { return _max_page_size_bytes; } /** * @brief Returns target page size, in rows. */ - auto get_target_page_size_rows() const { return _target_page_size_rows; } + auto get_max_page_size_rows() const { return _max_page_size_rows; } /** * @brief Sets metadata. @@ -950,12 +950,12 @@ class chunked_parquet_writer_options { /** * @brief Sets the maximum page size, in bytes. */ - void set_target_page_size_bytes(size_t pgsz_bytes) { _target_page_size_bytes = pgsz_bytes; } + void set_max_page_size_bytes(size_t pgsz_bytes) { _max_page_size_bytes = pgsz_bytes; } /** * @brief Sets the maximum page size, in rows. */ - void set_target_page_size_rows(size_type pgsz_rows) { _target_page_size_rows = pgsz_rows; } + void set_max_page_size_rows(size_type pgsz_rows) { _max_page_size_rows = pgsz_rows; } /** * @brief creates builder to build chunked_parquet_writer_options. @@ -1081,9 +1081,9 @@ class chunked_parquet_writer_options_builder { * @param val maximum page size * @return this for chaining. */ - chunked_parquet_writer_options_builder& target_page_size_bytes(size_t val) + chunked_parquet_writer_options_builder& max_page_size_bytes(size_t val) { - options.set_target_page_size_bytes(val); + options.set_max_page_size_bytes(val); return *this; } @@ -1093,9 +1093,9 @@ class chunked_parquet_writer_options_builder { * @param val maximum page size * @return this for chaining. */ - chunked_parquet_writer_options_builder& target_page_size_rows(size_type val) + chunked_parquet_writer_options_builder& max_page_size_rows(size_type val) { - options.set_target_page_size_rows(val); + options.set_max_page_size_rows(val); return *this; } diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 9fff984884e..0faba491e2d 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -241,8 +241,8 @@ __global__ void __launch_bounds__(128) statistics_merge_group* chunk_grstats, size_t max_page_comp_data_size, int32_t num_columns, - size_t target_page_size_bytes, - size_type target_page_size_rows) + size_t max_page_size_bytes, + size_type max_page_size_rows) { // TODO: All writing seems to be done by thread 0. Could be replaced by thrust foreach __shared__ __align__(8) parquet_column_device_view col_g; @@ -341,11 +341,11 @@ __global__ void __launch_bounds__(128) : 512 * 1024; // override max_page_size if target is smaller - if (max_page_size > target_page_size_bytes) max_page_size = target_page_size_bytes; + if (max_page_size > max_page_size_bytes) max_page_size = max_page_size_bytes; if (num_rows >= ck_g.num_rows || (values_in_page > 0 && (page_size + fragment_data_size > max_page_size)) || - rows_in_page > target_page_size_rows) { + rows_in_page > max_page_size_rows) { if (ck_g.use_dictionary) { page_size = 1 + 5 + ((values_in_page * ck_g.dict_rle_bits + 7) >> 3) + (values_in_page >> 8); @@ -1934,8 +1934,8 @@ void InitEncoderPages(device_2dspan chunks, device_span pages, device_span col_desc, int32_t num_columns, - size_t target_page_size_bytes, - size_type target_page_size_rows, + size_t max_page_size_bytes, + size_type max_page_size_rows, statistics_merge_group* page_grstats, statistics_merge_group* chunk_grstats, size_t max_page_comp_data_size, @@ -1950,8 +1950,8 @@ void InitEncoderPages(device_2dspan chunks, chunk_grstats, max_page_comp_data_size, num_columns, - target_page_size_bytes, - target_page_size_rows); + max_page_size_bytes, + max_page_size_rows); } void EncodePages(device_span pages, diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 1e999f2aaab..1c8e30c51ec 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -575,8 +575,8 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, device_span pages, device_span col_desc, int32_t num_columns, - size_t target_page_size_bytes, - size_type target_page_size_rows, + size_t max_page_size_bytes, + size_type max_page_size_rows, statistics_merge_group* page_grstats, statistics_merge_group* chunk_grstats, size_t max_page_comp_data_size, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f46ded26dcf..062c9378d1d 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -863,8 +863,8 @@ void writer::impl::init_page_sizes(hostdevice_2dvector& chu {}, col_desc, num_columns, - target_page_size_bytes, - target_page_size_rows, + max_page_size_bytes, + max_page_size_rows, nullptr, nullptr, 0, @@ -974,8 +974,8 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector& pages, col_desc, num_columns, - target_page_size_bytes, - target_page_size_rows, + max_page_size_bytes, + max_page_size_rows, (num_stats_bfr) ? page_stats_mrg.data() : nullptr, (num_stats_bfr > num_pages) ? page_stats_mrg.data() + num_pages : nullptr, max_page_comp_data_size, @@ -1133,8 +1133,8 @@ writer::impl::impl(std::vector> sinks, stream(stream), max_row_group_size{options.get_row_group_size_bytes()}, max_row_group_rows{options.get_row_group_size_rows()}, - target_page_size_bytes(options.get_target_page_size_bytes()), - target_page_size_rows(options.get_target_page_size_rows()), + max_page_size_bytes(options.get_max_page_size_bytes()), + max_page_size_rows(options.get_max_page_size_rows()), compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), @@ -1157,8 +1157,8 @@ writer::impl::impl(std::vector> sinks, stream(stream), max_row_group_size{options.get_row_group_size_bytes()}, max_row_group_rows{options.get_row_group_size_rows()}, - target_page_size_bytes(options.get_target_page_size_bytes()), - target_page_size_rows(options.get_target_page_size_rows()), + max_page_size_bytes(options.get_max_page_size_bytes()), + max_page_size_rows(options.get_max_page_size_rows()), compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 98870163915..a8be43eb1ed 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -210,8 +210,8 @@ class writer::impl { size_t max_row_group_size = default_row_group_size_bytes; size_type max_row_group_rows = default_row_group_size_rows; - size_t target_page_size_bytes = default_target_page_size_bytes; - size_type target_page_size_rows = default_target_page_size_rows; + size_t max_page_size_bytes = default_max_page_size_bytes; + size_type max_page_size_rows = default_max_page_size_rows; Compression compression_ = Compression::UNCOMPRESSED; statistics_freq stats_granularity_ = statistics_freq::STATISTICS_NONE; bool int96_timestamps = false; From cf02a0dd6a9ce2caaab51647f21a85b80acb9d35 Mon Sep 17 00:00:00 2001 From: etseidl Date: Wed, 18 May 2022 12:14:35 -0700 Subject: [PATCH 07/15] forgot to change "target" to "maximum" in commencts Co-authored-by: Bradley Dice --- cpp/include/cudf/io/parquet.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index b9815bcd05f..e7be2dc53e0 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -489,12 +489,12 @@ class parquet_writer_options { auto get_row_group_size_rows() const { return _row_group_size_rows; } /** - * @brief Returns target page size, in bytes. + * @brief Returns maximum page size, in bytes. */ auto get_max_page_size_bytes() const { return _max_page_size_bytes; } /** - * @brief Returns target page size, in rows. + * @brief Returns maximum page size, in rows. */ auto get_max_page_size_rows() const { return _max_page_size_rows; } @@ -887,12 +887,12 @@ class chunked_parquet_writer_options { auto get_row_group_size_rows() const { return _row_group_size_rows; } /** - * @brief Returns target page size, in bytes. + * @brief Returns maximum page size, in bytes. */ auto get_max_page_size_bytes() const { return _max_page_size_bytes; } /** - * @brief Returns target page size, in rows. + * @brief Returns maximum page size, in rows. */ auto get_max_page_size_rows() const { return _max_page_size_rows; } From cab058e2021448bdd009f36bf61e6ca61864bc7c Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 19 May 2022 13:16:03 -0700 Subject: [PATCH 08/15] ensure page size bytes is less than or equal to row group size bytes update documentation --- cpp/include/cudf/io/parquet.hpp | 40 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index e7be2dc53e0..2cb26e2d0ce 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -489,9 +489,10 @@ class parquet_writer_options { auto get_row_group_size_rows() const { return _row_group_size_rows; } /** - * @brief Returns maximum page size, in bytes. + * @brief Returns the maximum uncompressed page size, in bytes. If set larger than the row group size, + * then this will return the row group size. */ - auto get_max_page_size_bytes() const { return _max_page_size_bytes; } + auto get_max_page_size_bytes() const { return std::min(_max_page_size_bytes, get_row_group_size_bytes()); } /** * @brief Returns maximum page size, in rows. @@ -576,7 +577,7 @@ class parquet_writer_options { void set_row_group_size_rows(size_type size_rows) { _row_group_size_rows = size_rows; } /** - * @brief Sets the maximum page size, in bytes. + * @brief Sets the maximum uncompressed page size, in bytes. */ void set_max_page_size_bytes(size_t size_bytes) { _max_page_size_bytes = size_bytes; } @@ -704,7 +705,7 @@ class parquet_writer_options_builder { /** * @brief Sets the maximum number of rows in output row groups. * - * @param val maximum number or rows + * @param val maximum number of rows * @return this for chaining. */ parquet_writer_options_builder& row_group_size_rows(size_type val) @@ -714,9 +715,11 @@ class parquet_writer_options_builder { } /** - * @brief Sets the maximum page size, in bytes. + * @brief Sets the maximum uncompressed page size, in bytes. Serves as a hint to the writer, + * and can be exceeded under certain circumstances. Cannot be larger than the row group size in bytes, + * and will be adjusted to match if it is. * - * @param val The page size to use. + * @param val maximum page size * @return this for chaining. */ parquet_writer_options_builder& max_page_size_bytes(size_t val) @@ -726,9 +729,9 @@ class parquet_writer_options_builder { } /** - * @brief Sets the maximum page size, in rows. + * @brief Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting. * - * @param val The page size to use. + * @param val maximum rows per page * @return this for chaining. */ parquet_writer_options_builder& max_page_size_rows(size_type val) @@ -887,9 +890,10 @@ class chunked_parquet_writer_options { auto get_row_group_size_rows() const { return _row_group_size_rows; } /** - * @brief Returns maximum page size, in bytes. + * @brief Returns maximum uncompressed page size, in bytes. If set larger than the row group size, + * then this will return the row group size. */ - auto get_max_page_size_bytes() const { return _max_page_size_bytes; } + auto get_max_page_size_bytes() const { return std::min(_max_page_size_bytes, get_row_group_size_bytes()); } /** * @brief Returns maximum page size, in rows. @@ -948,14 +952,14 @@ class chunked_parquet_writer_options { void set_row_group_size_rows(size_type size_rows) { _row_group_size_rows = size_rows; } /** - * @brief Sets the maximum page size, in bytes. + * @brief Sets the maximum uncompressed page size, in bytes. */ - void set_max_page_size_bytes(size_t pgsz_bytes) { _max_page_size_bytes = pgsz_bytes; } + void set_max_page_size_bytes(size_t size_bytes) { _max_page_size_bytes = size_bytes; } /** * @brief Sets the maximum page size, in rows. */ - void set_max_page_size_rows(size_type pgsz_rows) { _max_page_size_rows = pgsz_rows; } + void set_max_page_size_rows(size_type size_rows) { _max_page_size_rows = size_rows; } /** * @brief creates builder to build chunked_parquet_writer_options. @@ -1066,7 +1070,7 @@ class chunked_parquet_writer_options_builder { /** * @brief Sets the maximum number of rows in output row groups. * - * @param val maximum number or rows + * @param val maximum number of rows * @return this for chaining. */ chunked_parquet_writer_options_builder& row_group_size_rows(size_type val) @@ -1076,7 +1080,9 @@ class chunked_parquet_writer_options_builder { } /** - * @brief Sets the maximum page size, in bytes. + * @brief Sets the maximum uncompressed page size, in bytes. Serves as a hint to the writer, + * and can be exceeded under certain circumstances. Cannot be larger than the row group size in bytes, + * and will be adjusted to match if it is. * * @param val maximum page size * @return this for chaining. @@ -1088,9 +1094,9 @@ class chunked_parquet_writer_options_builder { } /** - * @brief Sets the maximum page size, in rows. + * @brief Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting. * - * @param val maximum page size + * @param val maximum rows per page * @return this for chaining. */ chunked_parquet_writer_options_builder& max_page_size_rows(size_type val) From 045e73bedf0215fffd2194a41a59c375377b71f5 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 19 May 2022 13:25:27 -0700 Subject: [PATCH 09/15] ensure page size rows is less than or equal to row group size rows --- cpp/include/cudf/io/parquet.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 2cb26e2d0ce..874bd10f6ba 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -495,9 +495,10 @@ class parquet_writer_options { auto get_max_page_size_bytes() const { return std::min(_max_page_size_bytes, get_row_group_size_bytes()); } /** - * @brief Returns maximum page size, in rows. + * @brief Returns maximum page size, in rows. If set larger than the row group size, then this will + * return the row group size. */ - auto get_max_page_size_rows() const { return _max_page_size_rows; } + auto get_max_page_size_rows() const { return std::min(_max_page_size_rows, get_row_group_size_rows()); } /** * @brief Sets partitions. @@ -730,6 +731,7 @@ class parquet_writer_options_builder { /** * @brief Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting. + * Cannot be larger than the row group size in rows, and will be adjusted to match if it is. * * @param val maximum rows per page * @return this for chaining. @@ -896,9 +898,10 @@ class chunked_parquet_writer_options { auto get_max_page_size_bytes() const { return std::min(_max_page_size_bytes, get_row_group_size_bytes()); } /** - * @brief Returns maximum page size, in rows. + * @brief Returns maximum page size, in rows. If set larger than the row group size, then this will + * return the row group size. */ - auto get_max_page_size_rows() const { return _max_page_size_rows; } + auto get_max_page_size_rows() const { return std::min(_max_page_size_rows, get_row_group_size_rows()); } /** * @brief Sets metadata. @@ -1095,6 +1098,7 @@ class chunked_parquet_writer_options_builder { /** * @brief Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting. + * Cannot be larger than the row group size in rows, and will be adjusted to match if it is. * * @param val maximum rows per page * @return this for chaining. From 42c955fba7ca33257c6b44e5bcb8fef4ceee46ab Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 19 May 2022 13:26:11 -0700 Subject: [PATCH 10/15] reimplement tests for consistent row group and page sizes --- cpp/tests/io/parquet_test.cpp | 46 ++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 4d791a1c46c..0443431ba52 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -3184,31 +3184,33 @@ TEST_F(ParquetReaderTest, EmptyOutput) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } -/* temporarily remove this test. with page size being tunable, can no longer just test for - * row groups being smaller than 512KiB. ideally one would use the configured page size, but - * then that would require setting the page size parameter before the row group size, which - * seems odd. should probably test for consistent parameters when instantiating the writer. -TEST_F(ParquetWriterTest, RowGroupSizeInvalid) +TEST_F(ParquetWriterTest, RowGroupPageSizeMatch) { const auto unused_table = std::make_unique
(); std::vector out_buffer; - EXPECT_THROW( - cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) - .row_group_size_rows(4999), - cudf::logic_error); - EXPECT_THROW( - cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) - .row_group_size_bytes(511 << 10), - cudf::logic_error); - - EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) - .row_group_size_rows(4999), - cudf::logic_error); - EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) - .row_group_size_bytes(511 << 10), - cudf::logic_error); -} -*/ + auto options = cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .row_group_size_bytes(128*1024) + .max_page_size_bytes(512*1024) + .row_group_size_rows(10000) + .max_page_size_rows(20000) + .build(); + EXPECT_EQ(options.get_row_group_size_bytes(), options.get_max_page_size_bytes()); + EXPECT_EQ(options.get_row_group_size_rows(), options.get_max_page_size_rows()); +} + +TEST_F(ParquetChunkedWriterTest, RowGroupPageSizeMatch) +{ + std::vector out_buffer; + + auto options = cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) + .row_group_size_bytes(128*1024) + .max_page_size_bytes(512*1024) + .row_group_size_rows(10000) + .max_page_size_rows(20000) + .build(); + EXPECT_EQ(options.get_row_group_size_bytes(), options.get_max_page_size_bytes()); + EXPECT_EQ(options.get_row_group_size_rows(), options.get_max_page_size_rows()); +} CUDF_TEST_PROGRAM_MAIN() From 88d0c0652ca2793e58523cafdece2294736fd4fd Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 19 May 2022 13:31:21 -0700 Subject: [PATCH 11/15] more clang-format fixes --- cpp/include/cudf/io/parquet.hpp | 40 +++++++++++++++++++++------------ cpp/tests/io/parquet_test.cpp | 17 +++++++------- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 874bd10f6ba..c752be45274 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -489,16 +489,22 @@ class parquet_writer_options { auto get_row_group_size_rows() const { return _row_group_size_rows; } /** - * @brief Returns the maximum uncompressed page size, in bytes. If set larger than the row group size, - * then this will return the row group size. + * @brief Returns the maximum uncompressed page size, in bytes. If set larger than the row group + * size, then this will return the row group size. */ - auto get_max_page_size_bytes() const { return std::min(_max_page_size_bytes, get_row_group_size_bytes()); } + auto get_max_page_size_bytes() const + { + return std::min(_max_page_size_bytes, get_row_group_size_bytes()); + } /** - * @brief Returns maximum page size, in rows. If set larger than the row group size, then this will - * return the row group size. + * @brief Returns maximum page size, in rows. If set larger than the row group size, then this + * will return the row group size. */ - auto get_max_page_size_rows() const { return std::min(_max_page_size_rows, get_row_group_size_rows()); } + auto get_max_page_size_rows() const + { + return std::min(_max_page_size_rows, get_row_group_size_rows()); + } /** * @brief Sets partitions. @@ -717,8 +723,8 @@ class parquet_writer_options_builder { /** * @brief Sets the maximum uncompressed page size, in bytes. Serves as a hint to the writer, - * and can be exceeded under certain circumstances. Cannot be larger than the row group size in bytes, - * and will be adjusted to match if it is. + * and can be exceeded under certain circumstances. Cannot be larger than the row group size in + * bytes, and will be adjusted to match if it is. * * @param val maximum page size * @return this for chaining. @@ -895,13 +901,19 @@ class chunked_parquet_writer_options { * @brief Returns maximum uncompressed page size, in bytes. If set larger than the row group size, * then this will return the row group size. */ - auto get_max_page_size_bytes() const { return std::min(_max_page_size_bytes, get_row_group_size_bytes()); } + auto get_max_page_size_bytes() const + { + return std::min(_max_page_size_bytes, get_row_group_size_bytes()); + } /** - * @brief Returns maximum page size, in rows. If set larger than the row group size, then this will - * return the row group size. + * @brief Returns maximum page size, in rows. If set larger than the row group size, then this + * will return the row group size. */ - auto get_max_page_size_rows() const { return std::min(_max_page_size_rows, get_row_group_size_rows()); } + auto get_max_page_size_rows() const + { + return std::min(_max_page_size_rows, get_row_group_size_rows()); + } /** * @brief Sets metadata. @@ -1084,8 +1096,8 @@ class chunked_parquet_writer_options_builder { /** * @brief Sets the maximum uncompressed page size, in bytes. Serves as a hint to the writer, - * and can be exceeded under certain circumstances. Cannot be larger than the row group size in bytes, - * and will be adjusted to match if it is. + * and can be exceeded under certain circumstances. Cannot be larger than the row group size in + * bytes, and will be adjusted to match if it is. * * @param val maximum page size * @return this for chaining. diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 0443431ba52..0670c35af1b 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -3189,9 +3189,10 @@ TEST_F(ParquetWriterTest, RowGroupPageSizeMatch) const auto unused_table = std::make_unique
(); std::vector out_buffer; - auto options = cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) - .row_group_size_bytes(128*1024) - .max_page_size_bytes(512*1024) + auto options = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .row_group_size_bytes(128 * 1024) + .max_page_size_bytes(512 * 1024) .row_group_size_rows(10000) .max_page_size_rows(20000) .build(); @@ -3204,11 +3205,11 @@ TEST_F(ParquetChunkedWriterTest, RowGroupPageSizeMatch) std::vector out_buffer; auto options = cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) - .row_group_size_bytes(128*1024) - .max_page_size_bytes(512*1024) - .row_group_size_rows(10000) - .max_page_size_rows(20000) - .build(); + .row_group_size_bytes(128 * 1024) + .max_page_size_bytes(512 * 1024) + .row_group_size_rows(10000) + .max_page_size_rows(20000) + .build(); EXPECT_EQ(options.get_row_group_size_bytes(), options.get_max_page_size_bytes()); EXPECT_EQ(options.get_row_group_size_rows(), options.get_max_page_size_rows()); } From 63b86097159123009f81a6e474abb982df2c624c Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 20 May 2022 08:52:20 -0700 Subject: [PATCH 12/15] put back validation that row group/page sizes are at least the size of a fragment. --- cpp/include/cudf/io/parquet.hpp | 32 ++++++++++++++++++++++++++++---- cpp/tests/io/parquet_test.cpp | 22 ++++++++++++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index c752be45274..3cba6bf114f 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -581,7 +581,13 @@ class parquet_writer_options { /** * @brief Sets the maximum row group size, in rows. */ - void set_row_group_size_rows(size_type size_rows) { _row_group_size_rows = size_rows; } + void set_row_group_size_rows(size_type size_rows) + { + CUDF_EXPECTS( + size_rows >= 5000, + "The maximum row group size cannot be smaller than the fragment size, which is 5000 rows."); + _row_group_size_rows = size_rows; + } /** * @brief Sets the maximum uncompressed page size, in bytes. @@ -591,7 +597,13 @@ class parquet_writer_options { /** * @brief Sets the maximum page size, in rows. */ - void set_max_page_size_rows(size_type size_rows) { _max_page_size_rows = size_rows; } + void set_max_page_size_rows(size_type size_rows) + { + CUDF_EXPECTS( + size_rows >= 5000, + "The maximum page size cannot be smaller than the fragment size, which is 5000 rows."); + _max_page_size_rows = size_rows; + } }; class parquet_writer_options_builder { @@ -964,7 +976,13 @@ class chunked_parquet_writer_options { /** * @brief Sets the maximum row group size, in rows. */ - void set_row_group_size_rows(size_type size_rows) { _row_group_size_rows = size_rows; } + void set_row_group_size_rows(size_type size_rows) + { + CUDF_EXPECTS( + size_rows >= 5000, + "The maximum row group size cannot be smaller than the fragment size, which is 5000 rows."); + _row_group_size_rows = size_rows; + } /** * @brief Sets the maximum uncompressed page size, in bytes. @@ -974,7 +992,13 @@ class chunked_parquet_writer_options { /** * @brief Sets the maximum page size, in rows. */ - void set_max_page_size_rows(size_type size_rows) { _max_page_size_rows = size_rows; } + void set_max_page_size_rows(size_type size_rows) + { + CUDF_EXPECTS( + size_rows >= 5000, + "The maximum page size cannot be smaller than the fragment size, which is 5000 rows."); + _max_page_size_rows = size_rows; + } /** * @brief creates builder to build chunked_parquet_writer_options. diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 0670c35af1b..f6f1b47d78e 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -3184,6 +3184,28 @@ TEST_F(ParquetReaderTest, EmptyOutput) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +TEST_F(ParquetWriterTest, RowGroupSizeInvalid) +{ + const auto unused_table = std::make_unique
(); + std::vector out_buffer; + + EXPECT_THROW( + cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .row_group_size_rows(4999), + cudf::logic_error); + EXPECT_THROW( + cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .max_page_size_rows(4999), + cudf::logic_error); + + EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) + .row_group_size_rows(4999), + cudf::logic_error); + EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) + .max_page_size_rows(4999), + cudf::logic_error); +} + TEST_F(ParquetWriterTest, RowGroupPageSizeMatch) { const auto unused_table = std::make_unique
(); From 46317eeeb11ff103d35a36e66261f062146afbd0 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 20 May 2022 09:05:02 -0700 Subject: [PATCH 13/15] change max_page_size to this_max_page_size to avoid confusion --- cpp/src/io/parquet/page_enc.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 0faba491e2d..cd1a997f736 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -336,15 +336,15 @@ __global__ void __launch_bounds__(128) ? frag_g.num_leaf_values * 2 // Assume worst-case of 2-bytes per dictionary index : frag_g.fragment_data_size; // TODO (dm): this convoluted logic to limit page size needs refactoring - uint32_t max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 - : (values_in_page * 3 >= ck_g.num_values) ? 384 * 1024 - : 512 * 1024; + uint32_t this_max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 + : (values_in_page * 3 >= ck_g.num_values) ? 384 * 1024 + : 512 * 1024; - // override max_page_size if target is smaller - if (max_page_size > max_page_size_bytes) max_page_size = max_page_size_bytes; + // override this_max_page_size if the requested size is smaller + if (this_max_page_size > max_page_size_bytes) this_max_page_size = max_page_size_bytes; if (num_rows >= ck_g.num_rows || - (values_in_page > 0 && (page_size + fragment_data_size > max_page_size)) || + (values_in_page > 0 && (page_size + fragment_data_size > this_max_page_size)) || rows_in_page > max_page_size_rows) { if (ck_g.use_dictionary) { page_size = From 1fe3fc740daed4c229171b936243a014e7d5b688 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 20 May 2022 16:19:06 -0700 Subject: [PATCH 14/15] set lower bound of 4KB on page size --- cpp/include/cudf/io/parquet.hpp | 28 ++++++++++++++++++++++++---- cpp/tests/io/parquet_test.cpp | 14 ++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 3cba6bf114f..d6812559e38 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -576,7 +576,13 @@ class parquet_writer_options { /** * @brief Sets the maximum row group size, in bytes. */ - void set_row_group_size_bytes(size_t size_bytes) { _row_group_size_bytes = size_bytes; } + void set_row_group_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS( + size_bytes >= 4 * 1024, + "The maximum row group size cannot be smaller than the minimum page size, which is 4KB."); + _row_group_size_bytes = size_bytes; + } /** * @brief Sets the maximum row group size, in rows. @@ -592,7 +598,11 @@ class parquet_writer_options { /** * @brief Sets the maximum uncompressed page size, in bytes. */ - void set_max_page_size_bytes(size_t size_bytes) { _max_page_size_bytes = size_bytes; } + void set_max_page_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS(size_bytes >= 4 * 1024, "The maximum page size cannot be smaller than 4KB."); + _max_page_size_bytes = size_bytes; + } /** * @brief Sets the maximum page size, in rows. @@ -971,7 +981,13 @@ class chunked_parquet_writer_options { /** * @brief Sets the maximum row group size, in bytes. */ - void set_row_group_size_bytes(size_t size_bytes) { _row_group_size_bytes = size_bytes; } + void set_row_group_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS( + size_bytes >= 4 * 1024, + "The maximum row group size cannot be smaller than the minimum page size, which is 4KB."); + _row_group_size_bytes = size_bytes; + } /** * @brief Sets the maximum row group size, in rows. @@ -987,7 +1003,11 @@ class chunked_parquet_writer_options { /** * @brief Sets the maximum uncompressed page size, in bytes. */ - void set_max_page_size_bytes(size_t size_bytes) { _max_page_size_bytes = size_bytes; } + void set_max_page_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS(size_bytes >= 4 * 1024, "The maximum page size cannot be smaller than 4KB."); + _max_page_size_bytes = size_bytes; + } /** * @brief Sets the maximum page size, in rows. diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index f6f1b47d78e..820d8036455 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -3197,6 +3197,14 @@ TEST_F(ParquetWriterTest, RowGroupSizeInvalid) cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) .max_page_size_rows(4999), cudf::logic_error); + EXPECT_THROW( + cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .row_group_size_bytes(3 << 10), + cudf::logic_error); + EXPECT_THROW( + cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .max_page_size_bytes(3 << 10), + cudf::logic_error); EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) .row_group_size_rows(4999), @@ -3204,6 +3212,12 @@ TEST_F(ParquetWriterTest, RowGroupSizeInvalid) EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) .max_page_size_rows(4999), cudf::logic_error); + EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) + .row_group_size_bytes(3 << 10), + cudf::logic_error); + EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) + .max_page_size_bytes(3 << 10), + cudf::logic_error); } TEST_F(ParquetWriterTest, RowGroupPageSizeMatch) From 39d027ef87b6fafc7e64274d07340f921410fd11 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 20 May 2022 16:55:28 -0700 Subject: [PATCH 15/15] implement suggestion from review, but needed to change type of this_max_page_size to size_t --- cpp/src/io/parquet/page_enc.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index cd1a997f736..518eac6f90d 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -336,12 +336,12 @@ __global__ void __launch_bounds__(128) ? frag_g.num_leaf_values * 2 // Assume worst-case of 2-bytes per dictionary index : frag_g.fragment_data_size; // TODO (dm): this convoluted logic to limit page size needs refactoring - uint32_t this_max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 - : (values_in_page * 3 >= ck_g.num_values) ? 384 * 1024 - : 512 * 1024; + size_t this_max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 + : (values_in_page * 3 >= ck_g.num_values) ? 384 * 1024 + : 512 * 1024; // override this_max_page_size if the requested size is smaller - if (this_max_page_size > max_page_size_bytes) this_max_page_size = max_page_size_bytes; + this_max_page_size = min(this_max_page_size, max_page_size_bytes); if (num_rows >= ck_g.num_rows || (values_in_page > 0 && (page_size + fragment_data_size > this_max_page_size)) ||