Skip to content

Commit

Permalink
Always add a src/dst buf record for columns to be copied, even if the…
Browse files Browse the repository at this point in the history
… data is nullptr/size is 0. Cleaner and avoids

odd edge cases that can crop up with strangely formed columns (eg, ones that have size 0, but a non-null data pointer).
  • Loading branch information
nvdbaranec committed Jan 4, 2021
1 parent ed4dca7 commit 523aef7
Show file tree
Hide file tree
Showing 2 changed files with 264 additions and 74 deletions.
155 changes: 83 additions & 72 deletions cpp/src/copying/contiguous_split.cu
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ template <typename InputIter>
size_t compute_offset_stack_size(InputIter begin, InputIter end, int offset_depth = 0)
{
return std::accumulate(begin, end, 0, [offset_depth](auto stack_size, column_view const& col) {
auto const num_buffers = (col.head() != nullptr ? 1 : 0) + (col.nullable() ? 1 : 0);
auto const num_buffers = 1 + (col.nullable() ? 1 : 0);
return stack_size + (offset_depth * num_buffers) +
compute_offset_stack_size(
col.child_begin(), col.child_end(), offset_depth + is_offset_type(col.type().id()));
Expand Down Expand Up @@ -302,11 +302,11 @@ OutputIter setup_src_buf_data(InputIter begin, InputIter end, OutputIter out_buf
out_buf++;
}
// NOTE: we're always returning the base pointer here. column-level offset is accounted
// for later.
if (col.head() != nullptr) {
*out_buf = const_cast<uint8_t*>(col.head<uint8_t>());
out_buf++;
}
// for later. Also, for some column types (string, list, struct) this pointer will be null
// because there is no associated data with the root column.
*out_buf = const_cast<uint8_t*>(col.head<uint8_t>());
out_buf++;

out_buf = setup_src_buf_data(col.child_begin(), col.child_end(), out_buf);
});
return out_buf;
Expand All @@ -331,8 +331,7 @@ template <typename InputIter>
size_type count_src_bufs(InputIter begin, InputIter end)
{
auto buf_iter = thrust::make_transform_iterator(begin, [](column_view const& col) {
return (col.head() != nullptr ? 1 : 0) + (col.nullable() ? 1 : 0) +
count_src_bufs(col.child_begin(), col.child_end());
return 1 + (col.nullable() ? 1 : 0) + count_src_bufs(col.child_begin(), col.child_end());
});
return std::accumulate(buf_iter, buf_iter + std::distance(begin, end), 0);
}
Expand Down Expand Up @@ -390,15 +389,10 @@ struct buf_info_functor {
}

// info for the data buffer
if (col.head()) {
*current = src_buf_info(
col.type().id(), nullptr, offset_stack_pos, parent_offset_index, false, col.offset());

current++;
offset_stack_pos += offset_depth;
}
*current = src_buf_info(
col.type().id(), nullptr, offset_stack_pos, parent_offset_index, false, col.offset());

return {current, offset_stack_pos};
return {current + 1, offset_stack_pos + offset_depth};
}

private:
Expand Down Expand Up @@ -429,40 +423,45 @@ std::pair<src_buf_info*, size_type> buf_info_functor::operator()<cudf::string_vi
add_null_buffer(col, current, offset_stack_pos, parent_offset_index, offset_depth);
}

// string columns can have no children and still be valid
// string columns hold no actual data, but we need to keep a record
// of it so we know it's size when we are constructing the output columns
*current = src_buf_info(
type_id::STRING, nullptr, offset_stack_pos, parent_offset_index, false, col.offset());
current++;
offset_stack_pos += offset_depth;

// string columns don't necessarily have children
if (col.num_children() > 0) {
CUDF_EXPECTS(col.num_children() == 2, "Encountered malformed string column");
strings_column_view scv(col);

auto offset_col = current;

// info for the offsets buffer
if (scv.offsets().head()) {
CUDF_EXPECTS(scv.offsets().nullable() == false, "Encountered nullable string offsets column");
*current = src_buf_info(type_id::INT32,
scv.offsets().begin<cudf::id_to_type<type_id::INT32>>(),
offset_stack_pos,
parent_offset_index,
false,
col.offset());

current++;
offset_stack_pos += offset_depth;
}

// prevent appending buf_info for non-exist chars buffer
if (scv.chars().head()) {
CUDF_EXPECTS(scv.chars().nullable() == false, "Encountered nullable string chars column");

// since we are crossing an offset boundary, our offset_depth and parent_offset_index go up.
offset_depth++;
parent_offset_index = offset_col - head;

// info for the chars buffer
*current = src_buf_info(
type_id::INT8, nullptr, offset_stack_pos, parent_offset_index, false, col.offset());
current++;
offset_stack_pos += offset_depth;
}
auto offset_col = current;
CUDF_EXPECTS(scv.offsets().nullable() == false, "Encountered nullable string offsets column");
*current = src_buf_info(type_id::INT32,
// note: offsets can be null in the case where the string column
// has been created with empty_like().
scv.offsets().begin<cudf::id_to_type<type_id::INT32>>(),
offset_stack_pos,
parent_offset_index,
false,
col.offset());

current++;
offset_stack_pos += offset_depth;

// since we are crossing an offset boundary, calculate our new depth and parent offset index.
offset_depth++;
parent_offset_index = offset_col - head;

// prevent appending buf_info for non-existent chars buffer
CUDF_EXPECTS(scv.chars().nullable() == false, "Encountered nullable string chars column");

// info for the chars buffer
*current = src_buf_info(
type_id::INT8, nullptr, offset_stack_pos, parent_offset_index, false, col.offset());
current++;
offset_stack_pos += offset_depth;
}

return {current, offset_stack_pos};
Expand All @@ -483,10 +482,20 @@ std::pair<src_buf_info*, size_type> buf_info_functor::operator()<cudf::list_view
add_null_buffer(col, current, offset_stack_pos, parent_offset_index, offset_depth);
}

auto offset_col = current;
// list columns hold no actual data, but we need to keep a record
// of it so we know it's size when we are constructing the output columns
*current = src_buf_info(
type_id::LIST, nullptr, offset_stack_pos, parent_offset_index, false, col.offset());
current++;
offset_stack_pos += offset_depth;

CUDF_EXPECTS(col.num_children() == 2, "Encountered malformed list column");

// info for the offsets buffer
*current = src_buf_info(type_id::INT32,
auto offset_col = current;
*current = src_buf_info(type_id::INT32,
// note: offsets can be null in the case where the lists column
// has been created with empty_like().
lcv.offsets().begin<cudf::id_to_type<type_id::INT32>>(),
offset_stack_pos,
parent_offset_index,
Expand All @@ -495,7 +504,7 @@ std::pair<src_buf_info*, size_type> buf_info_functor::operator()<cudf::list_view
current++;
offset_stack_pos += offset_depth;

// since we are crossing an offset boundary, our offset_depth and parent_offset_index go up.
// since we are crossing an offset boundary, calculate our new depth and parent offset index.
offset_depth++;
parent_offset_index = offset_col - head;

Expand All @@ -521,6 +530,13 @@ std::pair<src_buf_info*, size_type> buf_info_functor::operator()<cudf::struct_vi
add_null_buffer(col, current, offset_stack_pos, parent_offset_index, offset_depth);
}

// struct columns hold no actual data, but we need to keep a record
// of it so we know it's size when we are constructing the output columns
*current = src_buf_info(
type_id::STRUCT, nullptr, offset_stack_pos, parent_offset_index, false, col.offset());
current++;
offset_stack_pos += offset_depth;

// recurse on children
return setup_source_buf_info(col.child_begin(),
col.child_end(),
Expand Down Expand Up @@ -603,29 +619,19 @@ BufInfo build_output_columns(InputIter begin,
}
return std::make_pair(static_cast<bitmask_type const*>(nullptr), 0);
}();
uint8_t const* data_ptr;
size_type size;
std::tie(data_ptr, size) = [&]() {
if (src.head()) {
auto const ptr = base_ptr + current_info->dst_offset;
// if we have data, num_elements will always be the correct size.
// we don't want to use num_rows because if we are an offset column, num_rows
// represents the # of rows of our owning parent. num_elements always represents
// the proper size for this column
auto const size = current_info->num_elements;
++current_info;
return std::make_pair(ptr, size);
}
// Parent columns w/o data (e.g., strings, lists) don't have an associated `dst_buf_info`,
// therefore, use the first child's info if it has at least one child. Their num_rows value
// will be correct (also see comment above)
auto const size = (src.num_children() == 0) ? 0 : current_info->num_rows;
return std::make_pair(static_cast<uint8_t const*>(nullptr), size);
}();

// size/data pointer for the column
auto const size = current_info->num_elements;
uint8_t const* data_ptr =
size == 0 || src.head() == nullptr ? nullptr : base_ptr + current_info->dst_offset;
++current_info;

// children
auto children = std::vector<column_view>{};
children.reserve(src.num_children());
current_info = build_output_columns(
src.child_begin(), src.child_end(), current_info, std::back_inserter(children), base_ptr);

return column_view{src.type(), size, data_ptr, bitmask_ptr, null_count, 0, std::move(children)};
});

Expand Down Expand Up @@ -837,8 +843,13 @@ std::vector<contiguous_split_result> contiguous_split(cudf::table_view const& in
int row_end = d_indices[split_index + 1] + src_info.column_offset;
while (stack_size > 0) {
stack_size--;
row_start = d_src_buf_info[offset_stack[stack_size]].offsets[row_start];
row_end = d_src_buf_info[offset_stack[stack_size]].offsets[row_end];
auto const offsets = d_src_buf_info[offset_stack[stack_size]].offsets;
// this case can happen when you have empty string or list columns constructed with
// empty_like()
if (offsets != nullptr) {
row_start = offsets[row_start];
row_end = offsets[row_end];
}
}

// final row indices and row count
Expand All @@ -850,7 +861,7 @@ std::vector<contiguous_split_result> contiguous_split(cudf::table_view const& in
int const bit_shift = src_info.is_validity ? row_start % 32 : 0;
// # of rows isn't necessarily the same as # of elements to be copied.
auto const num_elements = [&]() {
if (src_info.offsets != nullptr) {
if (src_info.offsets != nullptr && num_rows > 0) {
return num_rows + 1;
} else if (src_info.is_validity) {
return (num_rows + 31) / 32;
Expand Down Expand Up @@ -987,4 +998,4 @@ std::vector<contiguous_split_result> contiguous_split(cudf::table_view const& in
return cudf::detail::contiguous_split(input, splits, rmm::cuda_stream_default, mr);
}

}; // namespace cudf
}; // namespace cudf
Loading

0 comments on commit 523aef7

Please sign in to comment.