Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle nested string columns with no children in contiguous_split. #6864

Merged
merged 15 commits into from
Jan 12, 2021
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@
- PR #6855 Fix `.str.replace_with_backrefs` docs examples
- PR #6853 Fix contiguous split of null string columns
- PR #6861 Fix compile error in type_dispatch_benchmark.cu
- PR #6864 Handle contiguous_split corner case for nested string columns with no children

nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved
- PR #6869 Avoid dependency resolution failure in latest version of pip by explicitly specifying versions for dask and distributed
- PR #6806 Force install of local conda artifacts
- PR #6887 Fix typo and `0-d` numpy array handling in binary operation
Expand Down
122 changes: 74 additions & 48 deletions cpp/src/copying/contiguous_split.cu
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ template <typename InputIter>
size_t compute_offset_stack_size(InputIter begin, InputIter end, int offset_depth = 0)
{
return std::accumulate(begin, end, 0, [offset_depth](auto stack_size, column_view const& col) {
auto const num_buffers = (col.head() != nullptr ? 1 : 0) + (col.nullable() ? 1 : 0);
auto const num_buffers = 1 + (col.nullable() ? 1 : 0);
return stack_size + (offset_depth * num_buffers) +
compute_offset_stack_size(
col.child_begin(), col.child_end(), offset_depth + is_offset_type(col.type().id()));
Expand Down Expand Up @@ -302,11 +302,11 @@ OutputIter setup_src_buf_data(InputIter begin, InputIter end, OutputIter out_buf
out_buf++;
}
// NOTE: we're always returning the base pointer here. column-level offset is accounted
// for later.
if (col.head() != nullptr) {
*out_buf = const_cast<uint8_t*>(col.head<uint8_t>());
out_buf++;
}
// for later. Also, for some column types (string, list, struct) this pointer will be null
// because there is no associated data with the root column.
*out_buf = const_cast<uint8_t*>(col.head<uint8_t>());
out_buf++;

out_buf = setup_src_buf_data(col.child_begin(), col.child_end(), out_buf);
});
return out_buf;
Expand All @@ -331,8 +331,7 @@ template <typename InputIter>
size_type count_src_bufs(InputIter begin, InputIter end)
{
auto buf_iter = thrust::make_transform_iterator(begin, [](column_view const& col) {
return (col.head() != nullptr ? 1 : 0) + (col.nullable() ? 1 : 0) +
count_src_bufs(col.child_begin(), col.child_end());
return 1 + (col.nullable() ? 1 : 0) + count_src_bufs(col.child_begin(), col.child_end());
});
return std::accumulate(buf_iter, buf_iter + std::distance(begin, end), 0);
}
Expand Down Expand Up @@ -419,38 +418,53 @@ std::pair<src_buf_info*, size_type> buf_info_functor::operator()<cudf::string_vi
int parent_offset_index,
int offset_depth)
{
strings_column_view scv(col);

if (col.nullable()) {
std::tie(current, offset_stack_pos) =
add_null_buffer(col, current, offset_stack_pos, parent_offset_index, offset_depth);
}

auto offset_col = current;
// string columns hold no actual data, but we need to keep a record
// of it so we know it's size when we are constructing the output columns
*current = src_buf_info(
type_id::STRING, nullptr, offset_stack_pos, parent_offset_index, false, col.offset());
current++;
offset_stack_pos += offset_depth;

// info for the offsets buffer
*current = src_buf_info(type_id::INT32,
scv.offsets().begin<cudf::id_to_type<type_id::INT32>>(),
offset_stack_pos,
parent_offset_index,
false,
col.offset());
// string columns don't necessarily have children
if (col.num_children() > 0) {
CUDF_EXPECTS(col.num_children() == 2, "Encountered malformed string column");
strings_column_view scv(col);

// info for the offsets buffer
auto offset_col = current;
CUDF_EXPECTS(scv.offsets().nullable() == false, "Encountered nullable string offsets column");
*current = src_buf_info(type_id::INT32,
// note: offsets can be null in the case where the string column
// has been created with empty_like().
scv.offsets().begin<cudf::id_to_type<type_id::INT32>>(),
offset_stack_pos,
parent_offset_index,
false,
col.offset());

// prevent appending buf_info for non-exist chars buffer
if (scv.chars_size() > 0) {
current++;
offset_stack_pos += offset_depth;

// since we are crossing an offset boundary, our offset_depth and parent_offset_index go up.
// since we are crossing an offset boundary, calculate our new depth and parent offset index.
offset_depth++;
parent_offset_index = offset_col - head;

// prevent appending buf_info for non-existent chars buffer
CUDF_EXPECTS(scv.chars().nullable() == false, "Encountered nullable string chars column");

// info for the chars buffer
*current = src_buf_info(
type_id::INT8, nullptr, offset_stack_pos, parent_offset_index, false, col.offset());
current++;
offset_stack_pos += offset_depth;
}

return {current + 1, offset_stack_pos + offset_depth};
return {current, offset_stack_pos};
}

template <>
Expand All @@ -468,10 +482,20 @@ std::pair<src_buf_info*, size_type> buf_info_functor::operator()<cudf::list_view
add_null_buffer(col, current, offset_stack_pos, parent_offset_index, offset_depth);
}

auto offset_col = current;
// list columns hold no actual data, but we need to keep a record
// of it so we know it's size when we are constructing the output columns
*current = src_buf_info(
type_id::LIST, nullptr, offset_stack_pos, parent_offset_index, false, col.offset());
current++;
offset_stack_pos += offset_depth;

CUDF_EXPECTS(col.num_children() == 2, "Encountered malformed list column");

// info for the offsets buffer
*current = src_buf_info(type_id::INT32,
auto offset_col = current;
*current = src_buf_info(type_id::INT32,
// note: offsets can be null in the case where the lists column
// has been created with empty_like().
lcv.offsets().begin<cudf::id_to_type<type_id::INT32>>(),
offset_stack_pos,
parent_offset_index,
Expand All @@ -480,7 +504,7 @@ std::pair<src_buf_info*, size_type> buf_info_functor::operator()<cudf::list_view
current++;
offset_stack_pos += offset_depth;

// since we are crossing an offset boundary, our offset_depth and parent_offset_index go up.
// since we are crossing an offset boundary, calculate our new depth and parent offset index.
offset_depth++;
parent_offset_index = offset_col - head;

Expand All @@ -506,6 +530,13 @@ std::pair<src_buf_info*, size_type> buf_info_functor::operator()<cudf::struct_vi
add_null_buffer(col, current, offset_stack_pos, parent_offset_index, offset_depth);
}

// struct columns hold no actual data, but we need to keep a record
// of it so we know it's size when we are constructing the output columns
*current = src_buf_info(
type_id::STRUCT, nullptr, offset_stack_pos, parent_offset_index, false, col.offset());
current++;
offset_stack_pos += offset_depth;

// recurse on children
return setup_source_buf_info(col.child_begin(),
col.child_end(),
Expand Down Expand Up @@ -588,29 +619,19 @@ BufInfo build_output_columns(InputIter begin,
}
return std::make_pair(static_cast<bitmask_type const*>(nullptr), 0);
}();
uint8_t const* data_ptr;
size_type size;
std::tie(data_ptr, size) = [&]() {
if (src.head()) {
auto const ptr = base_ptr + current_info->dst_offset;
// if we have data, num_elements will always be the correct size.
// we don't want to use num_rows because if we are an offset column, num_rows
// represents the # of rows of our owning parent. num_elements always represents
// the proper size for this column
auto const size = current_info->num_elements;
++current_info;
return std::make_pair(ptr, size);
}
// Parent columns w/o data (e.g., strings, lists) don't have an associated `dst_buf_info`,
// therefore, use the first child's info if it has at least one child. Their num_rows value
// will be correct (also see comment above)
auto const size = (src.num_children() == 0) ? 0 : current_info->num_rows;
return std::make_pair(static_cast<uint8_t const*>(nullptr), size);
}();

// size/data pointer for the column
auto const size = current_info->num_elements;
uint8_t const* data_ptr =
size == 0 || src.head() == nullptr ? nullptr : base_ptr + current_info->dst_offset;
++current_info;

// children
auto children = std::vector<column_view>{};
children.reserve(src.num_children());
current_info = build_output_columns(
src.child_begin(), src.child_end(), current_info, std::back_inserter(children), base_ptr);

return column_view{src.type(), size, data_ptr, bitmask_ptr, null_count, 0, std::move(children)};
});

Expand Down Expand Up @@ -822,8 +843,13 @@ std::vector<contiguous_split_result> contiguous_split(cudf::table_view const& in
int row_end = d_indices[split_index + 1] + src_info.column_offset;
while (stack_size > 0) {
stack_size--;
row_start = d_src_buf_info[offset_stack[stack_size]].offsets[row_start];
row_end = d_src_buf_info[offset_stack[stack_size]].offsets[row_end];
auto const offsets = d_src_buf_info[offset_stack[stack_size]].offsets;
// this case can happen when you have empty string or list columns constructed with
// empty_like()
if (offsets != nullptr) {
row_start = offsets[row_start];
row_end = offsets[row_end];
}
}

// final row indices and row count
Expand All @@ -835,7 +861,7 @@ std::vector<contiguous_split_result> contiguous_split(cudf::table_view const& in
int const bit_shift = src_info.is_validity ? row_start % 32 : 0;
// # of rows isn't necessarily the same as # of elements to be copied.
auto const num_elements = [&]() {
if (src_info.offsets != nullptr) {
if (src_info.offsets != nullptr && num_rows > 0) {
return num_rows + 1;
} else if (src_info.is_validity) {
return (num_rows + 31) / 32;
Expand Down Expand Up @@ -972,4 +998,4 @@ std::vector<contiguous_split_result> contiguous_split(cudf::table_view const& in
return cudf::detail::contiguous_split(input, splits, rmm::cuda_stream_default, mr);
}

}; // namespace cudf
}; // namespace cudf
nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved
Loading