Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add metadata_builder helper class #13232

Merged
merged 7 commits into from
May 1, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions cpp/include/cudf/detail/contiguous_split.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,70 @@ packed_columns pack(cudf::table_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

// opaque implementation of `metadata_builder` since it needs to use
// `serialized_column`, which is only defined in pack.cpp
class metadata_builder_impl;

/**
 * @brief Helper class that creates packed column metadata.
 *
 * This class is an interface to the opaque metadata that is used to
 * describe `contiguous_split` and `pack` results.
 */
class metadata_builder {
 public:
  /**
   * @brief Construct a new metadata_builder.
   *
   * @param num_root_columns is the number of top-level columns
   */
  explicit metadata_builder(size_type num_root_columns);

  /**
   * @brief Destructor, implemented as default.
   *
   * NOTE(review): `metadata_builder_impl` is incomplete in this header. Because the destructor
   * is defaulted inside the class body, it is implicitly defined in every translation unit that
   * destroys a `metadata_builder`, and `std::unique_ptr` needs the complete type at that point.
   * Consider declaring it here and defaulting it where `metadata_builder_impl` is defined.
   */
  ~metadata_builder() = default;

  /**
   * @brief Add a column to this metadata builder.
   *
   * Callers must call this function for the parent column first, followed by each of its
   * children, in the order maintained in the column/column_view.
   *
   * Example: given a table with a nested column "a" with 2 children, and a non-nested column "b":
   *
   * 1) add_column_to_meta(col_a)
   * 2) add_column_to_meta(col_a_child_1)
   * 3) add_column_to_meta(col_a_child_2)
   * 4) add_column_to_meta(col_b)
   *
   * @param col_type column data type
   * @param col_size column row count
   * @param col_null_count column null count
   * @param data_offset data offset from the column's base ptr,
   *                    or -1 for an empty column
   * @param null_mask_offset null mask offset from the column's base ptr,
   *                    or -1 for a column that isn't nullable
   * @param num_children number of children columns
   */
  void add_column_to_meta(data_type col_type,
                          size_type col_size,
                          size_type col_null_count,
                          int64_t data_offset,
                          int64_t null_mask_offset,
                          size_type num_children);

  /**
   * @brief Builds the opaque metadata for all added columns.
   *
   * @returns A vector containing the serialized column metadata
   */
  std::vector<uint8_t> build();

 private:
  // Owning pointer to the opaque implementation; the type is only forward-declared here.
  std::unique_ptr<metadata_builder_impl> impl;
};

} // namespace detail
} // namespace cudf
80 changes: 59 additions & 21 deletions cpp/src/copying/pack.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,13 @@ column_view deserialize_column(serialized_column serial_column,
* @brief Build and add metadata for a column and all of its children, recursively
*
*
* @param metadata Output vector of serialized_column metadata
* @param mb metadata_builder instance
* @param col Column to build metadata for
* @param base_ptr Base pointer for the entire contiguous buffer from which all columns
* were serialized into
* @param data_size Size of the incoming buffer
*/
void build_column_metadata(std::vector<serialized_column>& metadata,
void build_column_metadata(metadata_builder& mb,
column_view const& col,
uint8_t const* base_ptr,
size_t data_size)
Expand All @@ -126,12 +126,12 @@ void build_column_metadata(std::vector<serialized_column>& metadata,
int64_t const null_mask_offset = null_mask_ptr ? null_mask_ptr - base_ptr : -1;

// add metadata
metadata.emplace_back(
mb.add_column_to_meta(
col.type(), col.size(), col.null_count(), data_offset, null_mask_offset, col.num_children());

std::for_each(
col.child_begin(), col.child_end(), [&metadata, &base_ptr, &data_size](column_view const& col) {
build_column_metadata(metadata, col, base_ptr, data_size);
col.child_begin(), col.child_end(), [&mb, &base_ptr, &data_size](column_view const& col) {
build_column_metadata(mb, col, base_ptr, data_size);
});
}

Expand All @@ -156,27 +156,46 @@ std::vector<uint8_t> pack_metadata(ColumnIter begin,
uint8_t const* contiguous_buffer,
size_t buffer_size)
{
std::vector<serialized_column> metadata;
auto mb = metadata_builder(std::distance(begin, end));

// first metadata entry is a stub indicating how many total (top level) columns
// there are
metadata.emplace_back(
data_type{type_id::EMPTY}, static_cast<size_type>(std::distance(begin, end)), 0, -1, -1, 0);

std::for_each(begin, end, [&metadata, &contiguous_buffer, &buffer_size](column_view const& col) {
build_column_metadata(metadata, col, contiguous_buffer, buffer_size);
std::for_each(begin, end, [&mb, &contiguous_buffer, &buffer_size](column_view const& col) {
build_column_metadata(mb, col, contiguous_buffer, buffer_size);
});

// convert to anonymous bytes
std::vector<uint8_t> metadata_bytes;
auto const metadata_begin = reinterpret_cast<uint8_t const*>(metadata.data());
std::copy(metadata_begin,
metadata_begin + (metadata.size() * sizeof(serialized_column)),
std::back_inserter(metadata_bytes));

return metadata_bytes;
return mb.build();
}

class metadata_builder_impl {
public:
metadata_builder_impl() = default;

void add_column_to_meta(data_type col_type,
abellina marked this conversation as resolved.
Show resolved Hide resolved
size_type col_size,
size_type col_null_count,
int64_t data_offset,
int64_t null_mask_offset,
size_type num_children)
abellina marked this conversation as resolved.
Show resolved Hide resolved
{
metadata.emplace_back(
col_type, col_size, col_null_count, data_offset, null_mask_offset, num_children);
}

std::vector<uint8_t> build()
abellina marked this conversation as resolved.
Show resolved Hide resolved
{
// convert to anonymous bytes
std::vector<uint8_t> metadata_bytes;
mythrocks marked this conversation as resolved.
Show resolved Hide resolved
auto const metadata_begin = reinterpret_cast<uint8_t const*>(metadata.data());
std::copy(metadata_begin,
metadata_begin + (metadata.size() * sizeof(detail::serialized_column)),
std::back_inserter(metadata_bytes));

return metadata_bytes;
}

private:
std::vector<detail::serialized_column> metadata;
};

/**
* @copydoc cudf::detail::unpack
*/
Expand Down Expand Up @@ -208,6 +227,25 @@ table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data)
return table_view{get_columns(num_columns)};
}

/**
 * @brief Construct a builder and seed it with the leading stub entry.
 *
 * The first metadata entry is a stub of type `EMPTY` whose size field carries the number of
 * top-level columns; its offsets are -1 and its null and child counts are 0.
 *
 * @param num_root_columns is the number of top-level columns
 */
metadata_builder::metadata_builder(size_type num_root_columns)
  : impl(std::make_unique<metadata_builder_impl>())
{
  impl->add_column_to_meta(data_type{type_id::EMPTY}, num_root_columns, 0, -1, -1, 0);
}

/**
 * @brief Add a column to this metadata builder by forwarding to the implementation object.
 *
 * @param col_type column data type
 * @param col_size column row count
 * @param col_null_count column null count
 * @param data_offset data offset from the column's base ptr, or -1 for an empty column
 * @param null_mask_offset null mask offset from the column's base ptr,
 *                         or -1 for a column that isn't nullable
 * @param num_children number of children columns
 */
void metadata_builder::add_column_to_meta(data_type col_type,
                                          size_type col_size,
                                          size_type col_null_count,
                                          int64_t data_offset,
                                          int64_t null_mask_offset,
                                          size_type num_children)
{
  impl->add_column_to_meta(
    col_type, col_size, col_null_count, data_offset, null_mask_offset, num_children);
}

/**
 * @brief Builds the opaque metadata for all added columns.
 *
 * Delegates to the implementation object.
 *
 * @returns The byte vector produced by `metadata_builder_impl::build`
 */
std::vector<uint8_t> metadata_builder::build()
{
  return impl->build();
}

} // namespace detail

/**
Expand Down