Skip to content

Commit

Permalink
Handle sliced struct/list columns properly in concatenate() bounds ch…
Browse files Browse the repository at this point in the history
…ecking. (#8760)

Fixes  #8748

Note:  `concatenate_tests.cpp` was renamed to `concatenate_tests.cu` because of the addition of some thrust calls.  

Existing overflow tests moved to `OverflowTest` section.  New tests specific to this PR are:

`Overflowtest.Presliced`
`OverflowTest.BigColumnsSmallSlices`

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Michael Wang (https://github.com/isVoid)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: #8760
  • Loading branch information
nvdbaranec authored Jul 20, 2021
1 parent 456f088 commit b490a32
Show file tree
Hide file tree
Showing 4 changed files with 486 additions and 82 deletions.
142 changes: 109 additions & 33 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/cuda.cuh>
Expand Down Expand Up @@ -329,58 +330,133 @@ std::unique_ptr<column> concatenate_dispatch::operator()<cudf::struct_view>()

namespace {

void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_view stream);

/**
* @brief Functor for traversing child columns and recursively verifying concatenation
* bounds and types.
*/
class traverse_children {
public:
// nothing to do for simple types.
template <typename T>
void operator()(host_span<column_view const>, rmm::cuda_stream_view)
{
}

private:
// verify length of concatenated offsets.
void check_offsets_size(host_span<column_view const> cols)
{
// offsets. we can't just add up the total sizes of all offset child columns because each one
// has an extra value, regardless of the # of parent rows. So we have to add up the total # of
// rows in the base column and add 1 at the end
size_t const total_offset_count =
std::accumulate(cols.begin(),
cols.end(),
std::size_t{},
[](size_t a, auto const& b) -> size_t { return a + b.size(); }) +
1;
// note: output text must include "exceeds size_type range" for python error handling
CUDF_EXPECTS(total_offset_count <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
"Total number of concatenated offsets exceeds size_type range");
}
};

template <>
void traverse_children::operator()<cudf::string_view>(host_span<column_view const> cols,
rmm::cuda_stream_view stream)
{
// verify offsets
check_offsets_size(cols);

// chars
size_t const total_char_count = std::accumulate(
cols.begin(), cols.end(), std::size_t{}, [stream](size_t a, auto const& b) -> size_t {
strings_column_view scv(b);
return a + (b.is_empty()
? 0
: cudf::detail::get_value<offset_type>(
scv.offsets(), scv.offset() + b.size(), stream) -
cudf::detail::get_value<offset_type>(scv.offsets(), scv.offset(), stream));
});
// note: output text must include "exceeds size_type range" for python error handling
CUDF_EXPECTS(total_char_count <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
"Total number of concatenated chars exceeds size_type range");
}

template <>
void traverse_children::operator()<cudf::struct_view>(host_span<column_view const> cols,
rmm::cuda_stream_view stream)
{
// march each child
auto child_iter = thrust::make_counting_iterator(0);
auto const num_children = cols.front().num_children();
std::vector<column_view> nth_children;
nth_children.reserve(cols.size());
std::for_each(child_iter, child_iter + num_children, [&](auto child_index) {
std::transform(cols.begin(),
cols.end(),
std::back_inserter(nth_children),
[child_index, stream](column_view const& col) {
structs_column_view scv(col);
return scv.get_sliced_child(child_index);
});

bounds_and_type_check(nth_children, stream);
nth_children.clear();
});
}

template <>
void traverse_children::operator()<cudf::list_view>(host_span<column_view const> cols,
rmm::cuda_stream_view stream)
{
// verify offsets
check_offsets_size(cols);

// recurse into the child columns
std::vector<column_view> nth_children;
nth_children.reserve(cols.size());
std::transform(
cols.begin(), cols.end(), std::back_inserter(nth_children), [stream](column_view const& col) {
lists_column_view lcv(col);
return lcv.get_sliced_child(stream);
});
bounds_and_type_check(nth_children, stream);
}

/**
* @brief Verifies that the sum of the sizes of all the columns to be concatenated
* will not exceed the max value of size_type, and verifies all column types match
*
* @param begin Beginning of range of columns to check
* @param end End of range of columns to check
* @param columns_to_concat Span of columns to check
*
* @throws cudf::logic_error if the total length of the concatenated columns would
* exceed the max value of size_type
*
* @throws cudf::logic_error if all of the input column types don't match
*/
template <typename ColIter>
void bounds_and_type_check(ColIter begin, ColIter end)
void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_view stream)
{
CUDF_EXPECTS(std::all_of(begin,
end,
[expected_type = (*begin).type()](auto const& c) {
CUDF_EXPECTS(std::all_of(cols.begin(),
cols.end(),
[expected_type = cols.front().type()](auto const& c) {
return c.type() == expected_type;
}),
"Type mismatch in columns to concatenate.");

// total size of all concatenated rows
size_t const total_row_count =
std::accumulate(begin, end, std::size_t{}, [](size_t a, auto const& b) {
std::accumulate(cols.begin(), cols.end(), std::size_t{}, [](size_t a, auto const& b) {
return a + static_cast<size_t>(b.size());
});
CUDF_EXPECTS(total_row_count <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
// note: output text must include "exceeds size_type range" for python error handling
CUDF_EXPECTS(total_row_count <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
"Total number of concatenated rows exceeds size_type range");

// march each child
auto child_iter = thrust::make_counting_iterator(0);
auto const num_children = (*begin).num_children();
std::for_each(child_iter, child_iter + num_children, [&](auto child_index) {
std::vector<column_view> nth_children;
nth_children.reserve(std::distance(begin, end));

// we cannot do this via a transform iterator + std::copy_if because some columns
// can have no children if they are empty. so if we had 3 input string columns
// and 1 of them was empty, 2 of them would have 2 children, and 1 of them would have
// 0 children. so it is not safe to index col.child() directly.
std::for_each(begin, end, [child_index, &nth_children](column_view const& col) {
if (col.num_children() <= child_index) {
CUDF_EXPECTS(col.num_children() == 0,
"Encountered a child column with an unexpected # of children");
} else {
nth_children.push_back(col.child(child_index));
}
});

bounds_and_type_check(nth_children.begin(), nth_children.end());
});
// traverse children
cudf::type_dispatcher(cols.front().type(), traverse_children{}, cols, stream);
}

} // anonymous namespace
Expand All @@ -393,7 +469,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns_to_conc
CUDF_EXPECTS(not columns_to_concat.empty(), "Unexpected empty list of columns to concatenate.");

// verify all types match and that we won't overflow size_type in output size
bounds_and_type_check(columns_to_concat.begin(), columns_to_concat.end());
bounds_and_type_check(columns_to_concat, stream);

if (std::all_of(columns_to_concat.begin(), columns_to_concat.end(), [](column_view const& c) {
return c.is_empty();
Expand Down Expand Up @@ -428,7 +504,7 @@ std::unique_ptr<table> concatenate(host_span<table_view const> tables_to_concat,
[i](auto const& t) { return t.column(i); });

// verify all types match and that we won't overflow size_type in output size
bounds_and_type_check(cols.begin(), cols.end());
bounds_and_type_check(cols, stream);
concat_columns.emplace_back(detail::concatenate(cols, stream, mr));
}
return std::make_unique<table>(std::move(concat_columns));
Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ ConfigureTest(SORT_TEST
###################################################################################################
# - copying tests ---------------------------------------------------------------------------------
ConfigureTest(COPYING_TEST
copying/concatenate_tests.cpp
copying/concatenate_tests.cu
copying/copy_if_else_nested_tests.cpp
copying/copy_range_tests.cpp
copying/copy_tests.cu
Expand Down
Loading

0 comments on commit b490a32

Please sign in to comment.