Skip to content

Commit

Permalink
Fix scatter for all-empty-string column case (#10724)
Browse files Browse the repository at this point in the history
Closes #10717 

Fixes bug introduced with changes in #10673 which uses the `cudf::make_strings_column` that accepts a span of `string_view` objects with a null-placeholder. The placeholder can be unintentionally created in `create_string_vector_from_column` when given a strings column where all the rows are empty. The utility is fixed to prevent creating the placeholder for empty strings.

A gtest was added to scatter from/to an all-empty strings column to verify this behavior.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)
  - Robert Maynard (https://github.com/robertmaynard)

URL: #10724
  • Loading branch information
davidwendt authored Apr 27, 2022
1 parent 1f8a03e commit 3d92bf2
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 15 deletions.
7 changes: 6 additions & 1 deletion cpp/include/cudf/strings/detail/scatter.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,13 @@ std::unique_ptr<column> scatter(
// create vector of string_view's to scatter into
rmm::device_uvector<string_view> target_vector = create_string_vector_from_column(target, stream);

// this ensures empty strings are not mapped to nulls in the make_strings_column function
auto const size = thrust::distance(begin, end);
auto itr = thrust::make_transform_iterator(
begin, [] __device__(string_view const sv) { return sv.empty() ? string_view{} : sv; });

// do the scatter
thrust::scatter(rmm::exec_policy(stream), begin, end, scatter_map, target_vector.begin());
thrust::scatter(rmm::exec_policy(stream), itr, itr + size, scatter_map, target_vector.begin());

// build the output column
auto sv_span = cudf::device_span<string_view const>(target_vector);
Expand Down
15 changes: 10 additions & 5 deletions cpp/src/lists/copying/scatter_helper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,8 @@ struct list_child_constructor {

auto string_views = rmm::device_uvector<string_view>(num_child_rows, stream);

auto const null_string_view = string_view{nullptr, 0}; // placeholder for factory function

thrust::transform(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
Expand All @@ -241,7 +243,8 @@ struct list_child_constructor {
offset_size = list_offsets.size(),
d_list_vector = list_vector.begin(),
source_lists,
target_lists] __device__(auto index) {
target_lists,
null_string_view] __device__(auto index) {
auto const list_index_iter =
thrust::upper_bound(thrust::seq, offset_begin, offset_begin + offset_size, index);
auto const list_index =
Expand All @@ -254,14 +257,16 @@ struct list_child_constructor {
auto child_strings_column = lists_column.child();
auto strings_offset = lists_offsets_ptr[row_index] + intra_index;

return child_strings_column.is_null(strings_offset)
? string_view{nullptr, 0}
: child_strings_column.template element<string_view>(strings_offset);
if (child_strings_column.is_null(strings_offset)) { return null_string_view; }
auto const d_str = child_strings_column.template element<string_view>(strings_offset);
// ensure a string from an all-empty column is not mapped to the null placeholder
auto const empty_string_view = string_view{};
return d_str.empty() ? empty_string_view : d_str;
});

// string_views should now have been populated with source and target references.
auto sv_span = cudf::device_span<string_view const>(string_views);
return cudf::make_strings_column(sv_span, string_view{nullptr, 0}, stream, mr);
return cudf::make_strings_column(sv_span, null_string_view, stream, mr);
}

/**
Expand Down
22 changes: 14 additions & 8 deletions cpp/src/strings/utilities.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,20 @@ rmm::device_uvector<string_view> create_string_vector_from_column(

auto strings_vector = rmm::device_uvector<string_view>(input.size(), stream, mr);

thrust::transform(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(input.size()),
strings_vector.begin(),
[d_strings = *d_strings] __device__(size_type idx) {
return d_strings.is_null(idx) ? string_view{nullptr, 0} : d_strings.element<string_view>(idx);
});
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(input.size()),
strings_vector.begin(),
[d_strings = *d_strings] __device__(size_type idx) {
// placeholder for factory function that takes a span of string_views
auto const null_string_view = string_view{nullptr, 0};
if (d_strings.is_null(idx)) { return null_string_view; }
auto const d_str = d_strings.element<string_view>(idx);
// special case when the entire column is filled with empty strings:
// here the empty d_str may have a d_str.data() == nullptr
auto const empty_string_view = string_view{};
return d_str.empty() ? empty_string_view : d_str;
});

return strings_vector;
}
Expand Down
13 changes: 12 additions & 1 deletion cpp/tests/copying/scatter_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -573,6 +573,17 @@ TEST_F(ScatterStringsTests, ScatterScalarNoNulls)
CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected_table);
}

TEST_F(ScatterStringsTests, EmptyStrings)
{
cudf::test::strings_column_wrapper input{"", "", ""};
cudf::table_view t({input});

// Test for issue 10717: all-empty-string column scatter
auto map = cudf::test::fixed_width_column_wrapper<int32_t>({0});
auto result = cudf::scatter(t, map, t);
CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), t);
}

template <typename T>
class BooleanMaskScatter : public cudf::test::BaseFixture {
};
Expand Down

0 comments on commit 3d92bf2

Please sign in to comment.