Skip to content

Commit

Permalink
Add metadata_builder helper class (#13232)
Browse files Browse the repository at this point in the history
This PR introduces a helper class `metadata_builder` within the `cudf::detail` namespace that allows `pack` and `contiguous_split` to create metadata for a packed buffer without having access to metadata internals (e.g. `serialized_column`).

The class makes it possible for callers to build metadata independently of the current implementations in `pack`, which rely on `column_view`s being defined (actual valid cuDF columns). The chunked pack work will use this class to create metadata without needing to create a `column_view` in the first place, since it doesn't have a valid base pointer from which the columns could be instantiated.

Authors:
  - Alessandro Bellina (https://github.com/abellina)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #13232
  • Loading branch information
abellina authored May 1, 2023
1 parent 04fc42d commit f27be56
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 21 deletions.
65 changes: 65 additions & 0 deletions cpp/include/cudf/detail/contiguous_split.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,70 @@ packed_columns pack(cudf::table_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

// opaque implementation of `metadata_builder` since it needs to use
// `serialized_column`, which is only defined in pack.cpp
class metadata_builder_impl;

/**
* @brief Helper class that creates packed column metadata.
*
* This class is an interface to the opaque metadata that is used to
* describe `contiguous_split` and `pack` results.
*
* Typical use: construct it with the number of top-level columns, call
* `add_column_info_to_meta` once per column (each parent before its children),
* then call `build` to obtain the serialized bytes.
*/
class metadata_builder {
public:
/**
* @brief Construct a new metadata_builder.
*
* The constructor records a leading stub entry (type `EMPTY`) whose size field
* holds `num_root_columns`, so readers of the metadata can tell how many
* top-level columns follow.
*
* @param num_root_columns is the number of top-level columns
*/
explicit metadata_builder(size_type const num_root_columns);

/**
* @brief Destructor, defaulted.
*
* `metadata_builder_impl` is only forward-declared in this header, and the
* `std::unique_ptr` member needs the complete type wherever a
* `metadata_builder` is actually destroyed.
*
* NOTE(review): because the destructor is defaulted on its first declaration it
* is implicitly inline, so a translation unit that destroys a `metadata_builder`
* without seeing the full `metadata_builder_impl` definition will likely fail in
* `std::default_delete`. Consider declaring `~metadata_builder();` here and
* defining it as `= default` in pack.cpp, after the impl class -- confirm
* against the other users of this header.
*/
~metadata_builder() = default;

/**
* @brief Add a column to this metadata builder.
*
* Callers must call this function for the parent column first, followed by each
* of its children, in the order maintained in the column/column_view.
*
* Example: given a table with a nested column "a" with 2 children, and a non-nested column "b":
*
* 1) add_column_info_to_meta(col_a)
* 2) add_column_info_to_meta(col_a_child_1)
* 3) add_column_info_to_meta(col_a_child_2)
* 4) add_column_info_to_meta(col_b)
*
* @param col_type column data type
* @param col_size column row count
* @param col_null_count column null count
* @param data_offset data offset from the column's base ptr,
* or -1 for an empty column
* @param null_mask_offset null mask offset from the column's base ptr,
* or -1 for a column that isn't nullable
* @param num_children number of children columns
*/
void add_column_info_to_meta(data_type const col_type,
size_type const col_size,
size_type const col_null_count,
int64_t const data_offset,
int64_t const null_mask_offset,
size_type const num_children);

/**
* @brief Builds the opaque metadata for all added columns.
*
* The builder itself is not modified, so the call is `const`.
*
* @returns A vector containing the serialized column metadata
*/
std::vector<uint8_t> build() const;

private:
// Owning pointer to the opaque implementation; the concrete class lives in
// pack.cpp so that `serialized_column` does not leak into this header.
std::unique_ptr<metadata_builder_impl> impl;
};

} // namespace detail
} // namespace cudf
82 changes: 61 additions & 21 deletions cpp/src/copying/pack.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,13 @@ column_view deserialize_column(serialized_column serial_column,
* @brief Build and add metadata for a column and all of its children, recursively
*
*
* @param metadata Output vector of serialized_column metadata
* @param mb metadata_builder instance
* @param col Column to build metadata for
* @param base_ptr Base pointer for the entire contiguous buffer from which all columns
* were serialized into
* @param data_size Size of the incoming buffer
*/
void build_column_metadata(std::vector<serialized_column>& metadata,
void build_column_metadata(metadata_builder& mb,
column_view const& col,
uint8_t const* base_ptr,
size_t data_size)
Expand All @@ -126,12 +126,12 @@ void build_column_metadata(std::vector<serialized_column>& metadata,
int64_t const null_mask_offset = null_mask_ptr ? null_mask_ptr - base_ptr : -1;

// add metadata
metadata.emplace_back(
mb.add_column_info_to_meta(
col.type(), col.size(), col.null_count(), data_offset, null_mask_offset, col.num_children());

std::for_each(
col.child_begin(), col.child_end(), [&metadata, &base_ptr, &data_size](column_view const& col) {
build_column_metadata(metadata, col, base_ptr, data_size);
col.child_begin(), col.child_end(), [&mb, &base_ptr, &data_size](column_view const& col) {
build_column_metadata(mb, col, base_ptr, data_size);
});
}

Expand All @@ -156,27 +156,46 @@ std::vector<uint8_t> pack_metadata(ColumnIter begin,
uint8_t const* contiguous_buffer,
size_t buffer_size)
{
std::vector<serialized_column> metadata;
auto mb = metadata_builder(std::distance(begin, end));

// first metadata entry is a stub indicating how many total (top level) columns
// there are
metadata.emplace_back(
data_type{type_id::EMPTY}, static_cast<size_type>(std::distance(begin, end)), 0, -1, -1, 0);

std::for_each(begin, end, [&metadata, &contiguous_buffer, &buffer_size](column_view const& col) {
build_column_metadata(metadata, col, contiguous_buffer, buffer_size);
std::for_each(begin, end, [&mb, &contiguous_buffer, &buffer_size](column_view const& col) {
build_column_metadata(mb, col, contiguous_buffer, buffer_size);
});

// convert to anonymous bytes
std::vector<uint8_t> metadata_bytes;
auto const metadata_begin = reinterpret_cast<uint8_t const*>(metadata.data());
std::copy(metadata_begin,
metadata_begin + (metadata.size() * sizeof(serialized_column)),
std::back_inserter(metadata_bytes));

return metadata_bytes;
return mb.build();
}

/**
 * @brief Concrete implementation behind `metadata_builder`.
 *
 * Accumulates one `serialized_column` record per call to `add_column_info_to_meta`
 * and exposes the accumulated records as a flat byte buffer via `build`.
 */
class metadata_builder_impl {
 public:
  metadata_builder_impl() = default;

  /**
   * @brief Append one `serialized_column` record constructed from the given fields.
   *
   * The arguments are passed, unchanged and in this order, to the
   * `serialized_column` constructor.
   *
   * @param col_type column data type
   * @param col_size column row count
   * @param col_null_count column null count
   * @param data_offset data offset stored in the record
   * @param null_mask_offset null mask offset stored in the record
   * @param num_children number of children columns
   */
  void add_column_info_to_meta(data_type const col_type,
                               size_type const col_size,
                               size_type const col_null_count,
                               int64_t const data_offset,
                               int64_t const null_mask_offset,
                               size_type const num_children)
  {
    metadata.emplace_back(
      col_type, col_size, col_null_count, data_offset, null_mask_offset, num_children);
  }

  /**
   * @brief Return the accumulated records as anonymous bytes.
   *
   * @returns A vector of `metadata.size() * sizeof(serialized_column)` bytes holding
   * the records in insertion order; empty if no record was added.
   */
  std::vector<uint8_t> build() const
  {
    // View the records as raw bytes and copy them with the iterator-range
    // constructor: a single exact-size allocation instead of growing the
    // output one byte at a time through a back_inserter.
    auto const bytes_begin = reinterpret_cast<uint8_t const*>(metadata.data());
    auto const num_bytes   = metadata.size() * sizeof(detail::serialized_column);
    return std::vector<uint8_t>(bytes_begin, bytes_begin + num_bytes);
  }

 private:
  // Records in the order they were added.
  std::vector<detail::serialized_column> metadata;
};

/**
* @copydoc cudf::detail::unpack
*/
Expand Down Expand Up @@ -208,6 +227,27 @@ table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data)
return table_view{get_columns(num_columns)};
}

metadata_builder::metadata_builder(size_type const num_root_columns)
: impl(std::make_unique<metadata_builder_impl>())
{
// first metadata entry is a stub indicating how many total (top level) columns
// there are
impl->add_column_info_to_meta(data_type{type_id::EMPTY}, num_root_columns, 0, -1, -1, 0);
}

/**
 * @brief Record one column's metadata by handing the fields to the implementation object.
 */
void metadata_builder::add_column_info_to_meta(data_type const col_type,
                                               size_type const col_size,
                                               size_type const col_null_count,
                                               int64_t const data_offset,
                                               int64_t const null_mask_offset,
                                               size_type const num_children)
{
  // Pure pass-through: every argument is forwarded unchanged, in the same order.
  auto& builder = *impl;
  builder.add_column_info_to_meta(col_type,
                                  col_size,
                                  col_null_count,
                                  data_offset,
                                  null_mask_offset,
                                  num_children);
}

/**
 * @brief Produce the serialized metadata bytes by delegating to the implementation object.
 */
std::vector<uint8_t> metadata_builder::build() const
{
  auto const& builder = *impl;
  return builder.build();
}

} // namespace detail

/**
Expand Down

0 comments on commit f27be56

Please sign in to comment.