From c78b7da284b3ef0353646973a8c5446452e0e29c Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 20 Apr 2023 22:38:41 -0500 Subject: [PATCH 1/6] Add metadata_builder helper class Signed-off-by: Alessandro Bellina --- cpp/include/cudf/detail/contiguous_split.hpp | 65 +++++++++++++++ cpp/src/copying/pack.cpp | 83 +++++++++++++++----- 2 files changed, 127 insertions(+), 21 deletions(-) diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp index 6abaaabdd3c..d9d689a204b 100644 --- a/cpp/include/cudf/detail/contiguous_split.hpp +++ b/cpp/include/cudf/detail/contiguous_split.hpp @@ -44,5 +44,70 @@ packed_columns pack(cudf::table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +// opaque implementation of `metadata_builder` since it needs to use +// `serialized_column`, which is only defined in pack.cpp +class metadata_builder_impl; + +/** + * @brief Helper class that creates packed column metadata. + * + * This class is an interface to the opaque metadata that is used to + * describe `contiguous_split` and `pack` results. + */ +class metadata_builder { + public: + /** + * @brief Construct a new metadata_builder. + * + * @param num_root_columns is the number of top-level columns + */ + explicit metadata_builder(size_type num_root_columns); + + /** + * @brief Destructor that will be implemented as default, required because metadata_builder_impl + * is incomplete at this stage. + */ + ~metadata_builder(); + + /** + * @brief Add a column to this metadata builder. + * + * Callers must call this function for the parent column and followed by any children, + * in the order maintained in the column/column_view. + * + * Example: given a table with a nested column "a" with 2 children, and a non-nested column "b": + * + * 1) add_column_to_meta(col_a) + * 2) add_column_to_meta(col_a_child_1) + * 3) add_column_to_meta(col_a_child_2) + * 4) add_column_to_meta(col_b) + * + * @param col_type column data type + * @param col_size column row count + * @param col_null_count column null count + * @param data_offset data offset from the column's base ptr, + * or -1 for an empty column + * @param null_mask_offset null mask offset from the column's base ptr, + * or -1 for a column that isn't nullable + * @param num_children number of chilren columns + */ + void add_column_to_meta(data_type col_type, + size_type col_size, + size_type col_null_count, + int64_t data_offset, + int64_t null_mask_offset, + size_type num_children); + + /** + * @brief Builds the opaque metadata for all added columns. + * + * @returns A vector containing the serialized column metadata + */ + std::vector build(); + + private: + std::unique_ptr impl; +}; + } // namespace detail } // namespace cudf diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index 5884fcc100a..9eb38072abd 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -98,13 +98,13 @@ column_view deserialize_column(serialized_column serial_column, * @brief Build and add metadata for a column and all of it's children, recursively * * - * @param metadata Output vector of serialized_column metadata + * @param mb metadata_builder instance * @param col Column to build metadata for * @param base_ptr Base pointer for the entire contiguous buffer from which all columns * were serialized into * @param data_size Size of the incoming buffer */ -void build_column_metadata(std::vector& metadata, +void build_column_metadata(metadata_builder& mb, column_view const& col, uint8_t const* base_ptr, size_t data_size) @@ -126,12 +126,12 @@ void build_column_metadata(std::vector& metadata, int64_t const null_mask_offset = null_mask_ptr ? null_mask_ptr - base_ptr : -1; // add metadata - metadata.emplace_back( + mb.add_column_to_meta( col.type(), col.size(), col.null_count(), data_offset, null_mask_offset, col.num_children()); std::for_each( - col.child_begin(), col.child_end(), [&metadata, &base_ptr, &data_size](column_view const& col) { - build_column_metadata(metadata, col, base_ptr, data_size); + col.child_begin(), col.child_end(), [&mb, &base_ptr, &data_size](column_view const& col) { + build_column_metadata(mb, col, base_ptr, data_size); }); } @@ -156,27 +156,46 @@ std::vector pack_metadata(ColumnIter begin, uint8_t const* contiguous_buffer, size_t buffer_size) { - std::vector metadata; + auto mb = metadata_builder(std::distance(begin, end)); - // first metadata entry is a stub indicating how many total (top level) columns - // there are - metadata.emplace_back( - data_type{type_id::EMPTY}, static_cast(std::distance(begin, end)), 0, -1, -1, 0); - - std::for_each(begin, end, [&metadata, &contiguous_buffer, &buffer_size](column_view const& col) { - build_column_metadata(metadata, col, contiguous_buffer, buffer_size); + std::for_each(begin, end, [&mb, &contiguous_buffer, &buffer_size](column_view const& col) { + build_column_metadata(mb, col, contiguous_buffer, buffer_size); }); - // convert to anonymous bytes - std::vector metadata_bytes; - auto const metadata_begin = reinterpret_cast(metadata.data()); - std::copy(metadata_begin, - metadata_begin + (metadata.size() * sizeof(serialized_column)), - std::back_inserter(metadata_bytes)); - - return metadata_bytes; + return mb.build(); } +class metadata_builder_impl { + public: + metadata_builder_impl() = default; + + void add_column_to_meta(data_type col_type, + size_type col_size, + size_type col_null_count, + int64_t data_offset, + int64_t null_mask_offset, + size_type num_children) + { + metadata.emplace_back( + col_type, col_size, col_null_count, data_offset, null_mask_offset, num_children); + } + + std::vector build() + { + // convert to anonymous bytes + std::vector metadata_bytes; + auto const metadata_begin = reinterpret_cast(metadata.data()); + std::copy(metadata_begin, + metadata_begin + (metadata.size() * sizeof(detail::serialized_column)), + std::back_inserter(metadata_bytes)); + + return metadata_bytes; + } + + private: + std::vector metadata; +}; + /** * @copydoc cudf::detail::unpack */ @@ -208,6 +227,28 @@ table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data) return table_view{get_columns(num_columns)}; } +metadata_builder::metadata_builder(size_type num_root_columns) + : impl(std::make_unique()) +{ + impl->add_column_to_meta( + data_type{type_id::EMPTY}, num_root_columns, 0, -1, -1, 0); +} + +metadata_builder::~metadata_builder() = default; + +void metadata_builder::add_column_to_meta(data_type col_type, + size_type col_size, + size_type col_null_count, + int64_t data_offset, + int64_t null_mask_offset, + size_type num_children) +{ + impl->add_column_to_meta( + col_type, col_size, col_null_count, data_offset, null_mask_offset, num_children); +} + +std::vector metadata_builder::build() { return impl->build(); } + } // namespace detail /** From c50e2bd14a42ab2840a6e27ada7688aed60201b6 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 26 Apr 2023 16:52:39 -0500 Subject: [PATCH 2/6] Fix code style issues and typo --- cpp/include/cudf/detail/contiguous_split.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp index d9d689a204b..ea1cd8c13ff 100644 --- a/cpp/include/cudf/detail/contiguous_split.hpp +++ b/cpp/include/cudf/detail/contiguous_split.hpp @@ -89,7 +89,7 @@ class metadata_builder { * or -1 for an empty column * @param null_mask_offset null mask offset from the column's base ptr, * or -1 for a column that isn't nullable - * @param num_children number of chilren columns + * @param num_children number of children columns */ void add_column_to_meta(data_type col_type, size_type col_size, From b531b2a03ec6b600b89aec8f93416231ad2bb4ec Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 26 Apr 2023 17:04:12 -0500 Subject: [PATCH 3/6] Fix style issue in pack.cpp --- cpp/src/copying/pack.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index 9eb38072abd..a9c248f6d76 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -230,8 +230,7 @@ table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data) metadata_builder::metadata_builder(size_type num_root_columns) : impl(std::make_unique()) { - impl->add_column_to_meta( - data_type{type_id::EMPTY}, num_root_columns, 0, -1, -1, 0); + impl->add_column_to_meta(data_type{type_id::EMPTY}, num_root_columns, 0, -1, -1, 0); } metadata_builder::~metadata_builder() = default; From 7cfa143f755b839c8fb4e781c98d4e116b1ff9bd Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 27 Apr 2023 17:01:57 -0500 Subject: [PATCH 4/6] Specify default metadata_builder destructor in header --- cpp/include/cudf/detail/contiguous_split.hpp | 2 +- cpp/src/copying/pack.cpp | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp index ea1cd8c13ff..00169a8d3b0 100644 --- a/cpp/include/cudf/detail/contiguous_split.hpp +++ b/cpp/include/cudf/detail/contiguous_split.hpp @@ -67,7 +67,7 @@ class metadata_builder { * @brief Destructor that will be implemented as default, required because metadata_builder_impl * is incomplete at this stage. */ - ~metadata_builder(); + ~metadata_builder() = default; /** * @brief Add a column to this metadata builder. diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index a9c248f6d76..73398710a90 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -233,8 +233,6 @@ metadata_builder::metadata_builder(size_type num_root_columns) impl->add_column_to_meta(data_type{type_id::EMPTY}, num_root_columns, 0, -1, -1, 0); } -metadata_builder::~metadata_builder() = default; - void metadata_builder::add_column_to_meta(data_type col_type, size_type col_size, size_type col_null_count, From dbecac564149430a90635442c5fc8041381bb651 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 28 Apr 2023 08:53:26 -0500 Subject: [PATCH 5/6] Address review comments --- cpp/include/cudf/detail/contiguous_split.hpp | 24 ++++++------- cpp/src/copying/pack.cpp | 36 +++++++++++--------- 2 files changed, 31 insertions(+), 29 deletions(-) diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp index 00169a8d3b0..4c6d19739cf 100644 --- a/cpp/include/cudf/detail/contiguous_split.hpp +++ b/cpp/include/cudf/detail/contiguous_split.hpp @@ -61,7 +61,7 @@ class metadata_builder { * * @param num_root_columns is the number of top-level columns */ - explicit metadata_builder(size_type num_root_columns); + explicit metadata_builder(size_type const num_root_columns); /** * @brief Destructor that will be implemented as default, required because metadata_builder_impl @@ -77,10 +77,10 @@ class metadata_builder { * * Example: given a table with a nested column "a" with 2 children, and a non-nested column "b": * - * 1) add_column_to_meta(col_a) - * 2) add_column_to_meta(col_a_child_1) - * 3) add_column_to_meta(col_a_child_2) - * 4) add_column_to_meta(col_b) + * 1) add_column_info_to_meta(col_a) + * 2) add_column_info_to_meta(col_a_child_1) + * 3) add_column_info_to_meta(col_a_child_2) + * 4) add_column_info_to_meta(col_b) * * @param col_type column data type * @param col_size column row count @@ -91,19 +91,19 @@ class metadata_builder { * or -1 for a column that isn't nullable * @param num_children number of children columns */ - void add_column_to_meta(data_type col_type, - size_type col_size, - size_type col_null_count, - int64_t data_offset, - int64_t null_mask_offset, - size_type num_children); + void add_column_info_to_meta(data_type const col_type, + size_type const col_size, + size_type const col_null_count, + int64_t const data_offset, + int64_t const null_mask_offset, + size_type const num_children); /** * @brief Builds the opaque metadata for all added columns. * * @returns A vector containing the serialized column metadata */ - std::vector build(); + std::vector build() const; private: std::unique_ptr impl; diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index 73398710a90..ca2bbd9374f 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -126,7 +126,7 @@ void build_column_metadata(metadata_builder& mb, int64_t const null_mask_offset = null_mask_ptr ? null_mask_ptr - base_ptr : -1; // add metadata - mb.add_column_to_meta( + mb.add_column_info_to_meta( col.type(), col.size(), col.null_count(), data_offset, null_mask_offset, col.num_children()); std::for_each( @@ -169,18 +169,18 @@ class metadata_builder_impl { public: metadata_builder_impl() = default; - void add_column_to_meta(data_type col_type, - size_type col_size, - size_type col_null_count, - int64_t data_offset, - int64_t null_mask_offset, - size_type num_children) + void add_column_info_to_meta(data_type const col_type, + size_type const col_size, + size_type const col_null_count, + int64_t const data_offset, + int64_t const null_mask_offset, + size_type const num_children) { metadata.emplace_back( col_type, col_size, col_null_count, data_offset, null_mask_offset, num_children); } - std::vector build() + std::vector build() const { // convert to anonymous bytes std::vector metadata_bytes; @@ -230,21 +230,23 @@ table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data) metadata_builder::metadata_builder(size_type num_root_columns) : impl(std::make_unique()) { - impl->add_column_to_meta(data_type{type_id::EMPTY}, num_root_columns, 0, -1, -1, 0); + // first metadata entry is a stub indicating how many total (top level) columns + // there are + impl->add_column_info_to_meta(data_type{type_id::EMPTY}, num_root_columns, 0, -1, -1, 0); } -void metadata_builder::add_column_to_meta(data_type col_type, - size_type col_size, - size_type col_null_count, - int64_t data_offset, - int64_t null_mask_offset, - size_type num_children) +void metadata_builder::add_column_info_to_meta(data_type const col_type, + size_type const col_size, + size_type const col_null_count, + int64_t const data_offset, + int64_t const null_mask_offset, + size_type const num_children) { - impl->add_column_to_meta( + impl->add_column_info_to_meta( col_type, col_size, col_null_count, data_offset, null_mask_offset, num_children); } -std::vector metadata_builder::build() { return impl->build(); } +std::vector metadata_builder::build() const { return impl->build(); } } // namespace detail From 5ed51ef748348da6e4f63dfb483c842971b598d7 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 28 Apr 2023 08:58:05 -0500 Subject: [PATCH 6/6] size_type const in metadata_builder constructor impl Co-authored-by: Mike Wilson --- cpp/src/copying/pack.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index ca2bbd9374f..bac9aac1886 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -227,7 +227,7 @@ table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data) return table_view{get_columns(num_columns)}; } -metadata_builder::metadata_builder(size_type num_root_columns) +metadata_builder::metadata_builder(size_type const num_root_columns) : impl(std::make_unique()) { // first metadata entry is a stub indicating how many total (top level) columns