Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle sliced struct/list columns properly in concatenate() bounds checking. #8760

Merged
merged 5 commits into from
Jul 20, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 109 additions & 32 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/cuda.cuh>
Expand Down Expand Up @@ -329,58 +330,134 @@ std::unique_ptr<column> concatenate_dispatch::operator()<cudf::struct_view>()

namespace {

void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_view stream);

/**
* @brief Functor for traversing child columns and recursively verifying concatenation
* bounds and types.
*/
class traverse_children {
public:
// nothing to do for simple types.
template <typename T>
void operator()(host_span<column_view const>, rmm::cuda_stream_view)
{
}

private:
// verify length of concatenated offsets.
void check_offsets_size(host_span<column_view const> cols)
{
// offsets. we can't just add up the total sizes of all offset child columns because each one
// has an extra value, regardless of the # of parent rows. So we have to add up the total # of
// rows in the base column and add 1 at the end
size_t const total_offset_count =
std::accumulate(cols.begin(),
cols.end(),
std::size_t{},
[](size_t a, auto const& b) -> size_t { return a + b.size(); }) +
1;
// note: output text must include "exceeds size_type range" for python error handling
CUDF_EXPECTS(
total_offset_count <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved
"Total number of concatenated offsets exceeds size_type range");
}
};

template <>
void traverse_children::operator()<cudf::string_view>(host_span<column_view const> cols,
rmm::cuda_stream_view stream)
{
// verify offsets
check_offsets_size(cols);

// chars
size_t const total_char_count = std::accumulate(
cols.begin(), cols.end(), std::size_t{}, [stream](size_t a, auto const& b) -> size_t {
strings_column_view scv(b);
return a + (b.is_empty()
? 0
: cudf::detail::get_value<offset_type>(
scv.offsets(), scv.offset() + b.size(), stream) -
cudf::detail::get_value<offset_type>(scv.offsets(), scv.offset(), stream));
Comment on lines +379 to +381
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Each get_value implicitly calls stream sync. So we have 2 syncs here. I wonder if we need to manually call cudaMemcpy twice, then justs do 1 stream sync for better performance.

});
// note: output text must include "exceeds size_type range" for python error handling
CUDF_EXPECTS(total_char_count <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved
"Total number of concatenated chars exceeds size_type range");
}

template <>
void traverse_children::operator()<cudf::struct_view>(host_span<column_view const> cols,
rmm::cuda_stream_view stream)
{
// march each child
auto child_iter = thrust::make_counting_iterator(0);
auto const num_children = cols.front().num_children();
std::vector<column_view> nth_children;
nth_children.reserve(cols.size());
std::for_each(child_iter, child_iter + num_children, [&](auto child_index) {
std::transform(cols.begin(),
cols.end(),
std::back_inserter(nth_children),
[child_index, stream](column_view const& col) {
structs_column_view scv(col);
return scv.get_sliced_child(child_index);
});

bounds_and_type_check(nth_children, stream);
nth_children.clear();
});
}

template <>
void traverse_children::operator()<cudf::list_view>(host_span<column_view const> cols,
rmm::cuda_stream_view stream)
{
// verify offsets
check_offsets_size(cols);

// recurse into the child columns
std::vector<column_view> nth_children;
nth_children.reserve(cols.size());
std::transform(
cols.begin(), cols.end(), std::back_inserter(nth_children), [stream](column_view const& col) {
lists_column_view lcv(col);
return lcv.get_sliced_child(stream);
});
bounds_and_type_check(nth_children, stream);
}

/**
* @brief Verifies that the sum of the sizes of all the columns to be concatenated
* will not exceed the max value of size_type, and verifies all column types match
*
* @param begin Beginning of range of columns to check
* @param end End of range of columns to check
* @param columns_to_concat Span of columns to check
*
* @throws cudf::logic_error if the total length of the concatenated columns would
* exceed the max value of size_type
*
* @throws cudf::logic_error if all of the input column types don't match
*/
template <typename ColIter>
void bounds_and_type_check(ColIter begin, ColIter end)
void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_view stream)
{
CUDF_EXPECTS(std::all_of(begin,
end,
[expected_type = (*begin).type()](auto const& c) {
CUDF_EXPECTS(std::all_of(cols.begin(),
cols.end(),
[expected_type = cols.front().type()](auto const& c) {
return c.type() == expected_type;
}),
"Type mismatch in columns to concatenate.");

// total size of all concatenated rows
size_t const total_row_count =
std::accumulate(begin, end, std::size_t{}, [](size_t a, auto const& b) {
std::accumulate(cols.begin(), cols.end(), std::size_t{}, [](size_t a, auto const& b) {
return a + static_cast<size_t>(b.size());
});
// note: output text must include "exceeds size_type range" for python error handling
CUDF_EXPECTS(total_row_count <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved
"Total number of concatenated rows exceeds size_type range");

// march each child
auto child_iter = thrust::make_counting_iterator(0);
auto const num_children = (*begin).num_children();
std::for_each(child_iter, child_iter + num_children, [&](auto child_index) {
std::vector<column_view> nth_children;
nth_children.reserve(std::distance(begin, end));

// we cannot do this via a transform iterator + std::copy_if because some columns
// can have no children if they are empty. so if we had 3 input string columns
// and 1 of them was empty, 2 of them would have 2 children, and 1 of them would have
// 0 children. so it is not safe to index col.child() directly.
std::for_each(begin, end, [child_index, &nth_children](column_view const& col) {
if (col.num_children() <= child_index) {
CUDF_EXPECTS(col.num_children() == 0,
"Encountered a child column with an unexpected # of children");
} else {
nth_children.push_back(col.child(child_index));
}
});

bounds_and_type_check(nth_children.begin(), nth_children.end());
});
// traverse children
cudf::type_dispatcher(cols.front().type(), traverse_children{}, cols, stream);
}

} // anonymous namespace
Expand All @@ -393,7 +470,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns_to_conc
CUDF_EXPECTS(not columns_to_concat.empty(), "Unexpected empty list of columns to concatenate.");

// verify all types match and that we won't overflow size_type in output size
bounds_and_type_check(columns_to_concat.begin(), columns_to_concat.end());
bounds_and_type_check(columns_to_concat, stream);

if (std::all_of(columns_to_concat.begin(), columns_to_concat.end(), [](column_view const& c) {
return c.is_empty();
Expand Down Expand Up @@ -428,7 +505,7 @@ std::unique_ptr<table> concatenate(host_span<table_view const> tables_to_concat,
[i](auto const& t) { return t.column(i); });

// verify all types match and that we won't overflow size_type in output size
bounds_and_type_check(cols.begin(), cols.end());
bounds_and_type_check(cols, stream);
concat_columns.emplace_back(detail::concatenate(cols, stream, mr));
}
return std::make_unique<table>(std::move(concat_columns));
Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ ConfigureTest(SORT_TEST
###################################################################################################
# - copying tests ---------------------------------------------------------------------------------
ConfigureTest(COPYING_TEST
copying/concatenate_tests.cpp
copying/concatenate_tests.cu
copying/copy_if_else_nested_tests.cpp
copying/copy_range_tests.cpp
copying/copy_tests.cu
Expand Down
Loading