Skip to content

Commit

Permalink
Handle nested string columns with no children in contiguous_split.(#6864
Browse files Browse the repository at this point in the history
)

Fixes a specific corner case:   String columns with no children (a special form of empty string column that can happen) that are nested inside a list (or struct) column.

This would be useful as a 0.17 PR but isn't strictly necessary, since it's pretty late.

Edit:  Updated the fix so that it always includes a record for src/dst buffers, even if they are of size 0 or have null data pointers.  The previous method that only checked the data pointer being null was unclean and didn't handle a particularly strange case that came up with the Spark plugin:  the plugin was reconstructing columns (on the receiver side of a shuffle) that had size 0 but a non-null data pointer.  This is technically legal but super weird.

Authors:
  - Dave Baranec <[email protected]>
  - Karthikeyan <[email protected]>

Approvers:
  - Karthikeyan (@karthikeyann)
  - Alfred Xu (@sperlingxx)
  - Karthikeyan (@karthikeyann)
  - Alfred Xu (@sperlingxx)
  - Karthikeyan (@karthikeyann)
  - Devavret Makkar (@devavret)

URL: #6864
  • Loading branch information
nvdbaranec authored Jan 12, 2021
1 parent 4da8312 commit d791e20
Show file tree
Hide file tree
Showing 3 changed files with 269 additions and 47 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@
- PR #6855 Fix `.str.replace_with_backrefs` docs examples
- PR #6853 Fix contiguous split of null string columns
- PR #6861 Fix compile error in type_dispatch_benchmark.cu
- PR #6864 Handle contiguous_split corner case for nested string columns with no children
- PR #6869 Avoid dependency resolution failure in latest version of pip by explicitly specifying versions for dask and distributed
- PR #6806 Force install of local conda artifacts
- PR #6887 Fix typo and `0-d` numpy array handling in binary operation
Expand Down
120 changes: 73 additions & 47 deletions cpp/src/copying/contiguous_split.cu
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ template <typename InputIter>
size_t compute_offset_stack_size(InputIter begin, InputIter end, int offset_depth = 0)
{
return std::accumulate(begin, end, 0, [offset_depth](auto stack_size, column_view const& col) {
auto const num_buffers = (col.head() != nullptr ? 1 : 0) + (col.nullable() ? 1 : 0);
auto const num_buffers = 1 + (col.nullable() ? 1 : 0);
return stack_size + (offset_depth * num_buffers) +
compute_offset_stack_size(
col.child_begin(), col.child_end(), offset_depth + is_offset_type(col.type().id()));
Expand Down Expand Up @@ -300,11 +300,11 @@ OutputIter setup_src_buf_data(InputIter begin, InputIter end, OutputIter out_buf
out_buf++;
}
// NOTE: we're always returning the base pointer here. column-level offset is accounted
// for later.
if (col.head() != nullptr) {
*out_buf = const_cast<uint8_t*>(col.head<uint8_t>());
out_buf++;
}
// for later. Also, for some column types (string, list, struct) this pointer will be null
// because there is no associated data with the root column.
*out_buf = const_cast<uint8_t*>(col.head<uint8_t>());
out_buf++;

out_buf = setup_src_buf_data(col.child_begin(), col.child_end(), out_buf);
});
return out_buf;
Expand All @@ -329,8 +329,7 @@ template <typename InputIter>
size_type count_src_bufs(InputIter begin, InputIter end)
{
auto buf_iter = thrust::make_transform_iterator(begin, [](column_view const& col) {
return (col.head() != nullptr ? 1 : 0) + (col.nullable() ? 1 : 0) +
count_src_bufs(col.child_begin(), col.child_end());
return 1 + (col.nullable() ? 1 : 0) + count_src_bufs(col.child_begin(), col.child_end());
});
return std::accumulate(buf_iter, buf_iter + std::distance(begin, end), 0);
}
Expand Down Expand Up @@ -417,38 +416,53 @@ std::pair<src_buf_info*, size_type> buf_info_functor::operator()<cudf::string_vi
int parent_offset_index,
int offset_depth)
{
strings_column_view scv(col);

if (col.nullable()) {
std::tie(current, offset_stack_pos) =
add_null_buffer(col, current, offset_stack_pos, parent_offset_index, offset_depth);
}

auto offset_col = current;
// string columns hold no actual data, but we need to keep a record
// of it so we know it's size when we are constructing the output columns
*current = src_buf_info(
type_id::STRING, nullptr, offset_stack_pos, parent_offset_index, false, col.offset());
current++;
offset_stack_pos += offset_depth;

// info for the offsets buffer
*current = src_buf_info(type_id::INT32,
scv.offsets().begin<cudf::id_to_type<type_id::INT32>>(),
offset_stack_pos,
parent_offset_index,
false,
col.offset());
// string columns don't necessarily have children
if (col.num_children() > 0) {
CUDF_EXPECTS(col.num_children() == 2, "Encountered malformed string column");
strings_column_view scv(col);

// info for the offsets buffer
auto offset_col = current;
CUDF_EXPECTS(scv.offsets().nullable() == false, "Encountered nullable string offsets column");
*current = src_buf_info(type_id::INT32,
// note: offsets can be null in the case where the string column
// has been created with empty_like().
scv.offsets().begin<cudf::id_to_type<type_id::INT32>>(),
offset_stack_pos,
parent_offset_index,
false,
col.offset());

// prevent appending buf_info for non-exist chars buffer
if (scv.chars_size() > 0) {
current++;
offset_stack_pos += offset_depth;

// since we are crossing an offset boundary, our offset_depth and parent_offset_index go up.
// since we are crossing an offset boundary, calculate our new depth and parent offset index.
offset_depth++;
parent_offset_index = offset_col - head;

// prevent appending buf_info for non-existent chars buffer
CUDF_EXPECTS(scv.chars().nullable() == false, "Encountered nullable string chars column");

// info for the chars buffer
*current = src_buf_info(
type_id::INT8, nullptr, offset_stack_pos, parent_offset_index, false, col.offset());
current++;
offset_stack_pos += offset_depth;
}

return {current + 1, offset_stack_pos + offset_depth};
return {current, offset_stack_pos};
}

template <>
Expand All @@ -466,10 +480,20 @@ std::pair<src_buf_info*, size_type> buf_info_functor::operator()<cudf::list_view
add_null_buffer(col, current, offset_stack_pos, parent_offset_index, offset_depth);
}

auto offset_col = current;
// list columns hold no actual data, but we need to keep a record
// of it so we know it's size when we are constructing the output columns
*current = src_buf_info(
type_id::LIST, nullptr, offset_stack_pos, parent_offset_index, false, col.offset());
current++;
offset_stack_pos += offset_depth;

CUDF_EXPECTS(col.num_children() == 2, "Encountered malformed list column");

// info for the offsets buffer
*current = src_buf_info(type_id::INT32,
auto offset_col = current;
*current = src_buf_info(type_id::INT32,
// note: offsets can be null in the case where the lists column
// has been created with empty_like().
lcv.offsets().begin<cudf::id_to_type<type_id::INT32>>(),
offset_stack_pos,
parent_offset_index,
Expand All @@ -478,7 +502,7 @@ std::pair<src_buf_info*, size_type> buf_info_functor::operator()<cudf::list_view
current++;
offset_stack_pos += offset_depth;

// since we are crossing an offset boundary, our offset_depth and parent_offset_index go up.
// since we are crossing an offset boundary, calculate our new depth and parent offset index.
offset_depth++;
parent_offset_index = offset_col - head;

Expand All @@ -504,6 +528,13 @@ std::pair<src_buf_info*, size_type> buf_info_functor::operator()<cudf::struct_vi
add_null_buffer(col, current, offset_stack_pos, parent_offset_index, offset_depth);
}

// struct columns hold no actual data, but we need to keep a record
// of it so we know it's size when we are constructing the output columns
*current = src_buf_info(
type_id::STRUCT, nullptr, offset_stack_pos, parent_offset_index, false, col.offset());
current++;
offset_stack_pos += offset_depth;

// recurse on children
return setup_source_buf_info(col.child_begin(),
col.child_end(),
Expand Down Expand Up @@ -586,29 +617,19 @@ BufInfo build_output_columns(InputIter begin,
}
return std::make_pair(static_cast<bitmask_type const*>(nullptr), 0);
}();
uint8_t const* data_ptr;
size_type size;
std::tie(data_ptr, size) = [&]() {
if (src.head()) {
auto const ptr = base_ptr + current_info->dst_offset;
// if we have data, num_elements will always be the correct size.
// we don't want to use num_rows because if we are an offset column, num_rows
// represents the # of rows of our owning parent. num_elements always represents
// the proper size for this column
auto const size = current_info->num_elements;
++current_info;
return std::make_pair(ptr, size);
}
// Parent columns w/o data (e.g., strings, lists) don't have an associated `dst_buf_info`,
// therefore, use the first child's info if it has at least one child. Their num_rows value
// will be correct (also see comment above)
auto const size = (src.num_children() == 0) ? 0 : current_info->num_rows;
return std::make_pair(static_cast<uint8_t const*>(nullptr), size);
}();

// size/data pointer for the column
auto const size = current_info->num_elements;
uint8_t const* data_ptr =
size == 0 || src.head() == nullptr ? nullptr : base_ptr + current_info->dst_offset;
++current_info;

// children
auto children = std::vector<column_view>{};
children.reserve(src.num_children());
current_info = build_output_columns(
src.child_begin(), src.child_end(), current_info, std::back_inserter(children), base_ptr);

return column_view{src.type(), size, data_ptr, bitmask_ptr, null_count, 0, std::move(children)};
});

Expand Down Expand Up @@ -819,8 +840,13 @@ std::vector<contiguous_split_result> contiguous_split(cudf::table_view const& in
int row_end = d_indices[split_index + 1] + src_info.column_offset;
while (stack_size > 0) {
stack_size--;
row_start = d_src_buf_info[offset_stack[stack_size]].offsets[row_start];
row_end = d_src_buf_info[offset_stack[stack_size]].offsets[row_end];
auto const offsets = d_src_buf_info[offset_stack[stack_size]].offsets;
// this case can happen when you have empty string or list columns constructed with
// empty_like()
if (offsets != nullptr) {
row_start = offsets[row_start];
row_end = offsets[row_end];
}
}

// final row indices and row count
Expand All @@ -832,7 +858,7 @@ std::vector<contiguous_split_result> contiguous_split(cudf::table_view const& in
int const bit_shift = src_info.is_validity ? row_start % 32 : 0;
// # of rows isn't necessarily the same as # of elements to be copied.
auto const num_elements = [&]() {
if (src_info.offsets != nullptr) {
if (src_info.offsets != nullptr && num_rows > 0) {
return num_rows + 1;
} else if (src_info.is_validity) {
return (num_rows + 31) / 32;
Expand Down
Loading

0 comments on commit d791e20

Please sign in to comment.