Eliminate duplicate allocation of nested string columns #15142

Closed
13 changes: 7 additions & 6 deletions cpp/src/io/parquet/reader_impl.cpp
@@ -60,7 +60,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
// TODO: This step is somewhat redundant if size info has already been calculated (nested schema,
// chunked reader).
auto const has_strings = (kernel_mask & STRINGS_MASK) != 0;
- std::vector<size_t> col_sizes(_input_columns.size(), 0L);
+ std::vector<size_t> col_string_sizes(_input_columns.size(), 0L);
if (has_strings) {
ComputePageStringSizes(subpass.pages,
pass.chunks,
@@ -71,10 +71,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
kernel_mask,
_stream);

- col_sizes = calculate_page_string_offsets();
+ col_string_sizes = calculate_page_string_offsets();

// check for overflow
- if (std::any_of(col_sizes.cbegin(), col_sizes.cend(), [](size_t sz) {
+ if (std::any_of(col_string_sizes.cbegin(), col_string_sizes.cend(), [](size_t sz) {
return sz > std::numeric_limits<size_type>::max();
})) {
CUDF_FAIL("String column exceeds the column size limit", std::overflow_error);
@@ -157,8 +157,9 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
valids[idx] = out_buf.null_mask();
data[idx] = out_buf.data();
// only do string buffer for leaf
- if (out_buf.string_size() == 0 && col_sizes[pass.chunks[c].src_col_index] > 0) {
-   out_buf.create_string_data(col_sizes[pass.chunks[c].src_col_index], _stream);
+ if (idx == max_depth - 1 and out_buf.string_size() == 0 and
+     col_string_sizes[pass.chunks[c].src_col_index] > 0) {
+   out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], _stream);
}
if (has_strings) { str_data[idx] = out_buf.string_data(); }
out_buf.user_data |=
@@ -281,7 +282,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED;
} else if (out_buf.type.id() == type_id::STRING) {
// need to cap off the string offsets column
- size_type const sz = static_cast<size_type>(col_sizes[idx]);
+ size_type const sz = static_cast<size_type>(col_string_sizes[idx]);
cudaMemcpyAsync(static_cast<int32_t*>(out_buf.data()) + out_buf.size,
&sz,
sizeof(size_type),
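
The substantive change is in the @@ -157 hunk: the added idx == max_depth - 1 condition creates the string character buffer only for the leaf level of a nested string column (e.g. list<string>), rather than at every nesting level where string_size() is still zero, which appears to be the duplicate allocation the PR title refers to. A simplified sketch of that rule, with illustrative stand-in types rather than cudf's real column_buffer:

// Hypothetical, simplified sketch of the leaf-only allocation rule;
// column_buffer_stub and allocate_leaf_string_data are stand-ins, not cudf APIs.
#include <cstddef>
#include <vector>

struct column_buffer_stub {
  std::size_t char_bytes = 0;  // size of the character buffer, 0 if not yet allocated
  std::size_t string_size() const { return char_bytes; }
  void create_string_data(std::size_t bytes) { char_bytes = bytes; }
};

// One buffer per nesting level; only the deepest (leaf) level holds string characters,
// so the character buffer for a column is allocated exactly once.
void allocate_leaf_string_data(std::vector<column_buffer_stub>& levels, std::size_t col_string_size)
{
  auto const max_depth = levels.size();
  for (std::size_t idx = 0; idx < max_depth; ++idx) {
    auto& out_buf = levels[idx];
    if (idx == max_depth - 1 && out_buf.string_size() == 0 && col_string_size > 0) {
      out_buf.create_string_data(col_string_size);
    }
  }
}

For a flat (non-nested) string column max_depth is 1, so idx == max_depth - 1 always holds and the behaviour is unchanged; the guard only skips the intermediate list levels of nested columns.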