Skip to content

Commit

Permalink
Fix memory access error in cudf::shift for sliced strings (#13894)
Browse files Browse the repository at this point in the history
Fixes `cudf::strings::detail::shift` logic with sliced input strings column when copying the chars data to the output column.
Added additional tests including a null fill scalar.

Closes #13852

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #13894
  • Loading branch information
davidwendt authored Aug 23, 2023
1 parent 62148b4 commit 88a8efc
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 53 deletions.
84 changes: 37 additions & 47 deletions cpp/src/strings/copying/shift.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/strings/detail/copying.hpp>
#include <cudf/strings/detail/utilities.hpp>

Expand All @@ -31,35 +33,31 @@ namespace cudf::strings::detail {

namespace {

struct adjust_offsets_fn {
column_device_view const d_column;
struct output_sizes_fn {
column_device_view const d_column; // input strings column
string_view const d_filler;
size_type const offset;

__device__ size_type get_string_size_at(size_type idx)
{
return d_column.is_null(idx) ? 0 : d_column.element<string_view>(idx).size_bytes();
}

__device__ size_type operator()(size_type idx)
{
auto const last_index = offset < 0 ? d_column.size() + offset : offset;
if (offset < 0) {
auto const first = d_column.element<size_type>(-offset);
auto const last_index = d_column.size() + offset;
if (idx < last_index) {
return d_column.element<size_type>(idx - offset) - first;
} else {
auto const last = d_column.element<size_type>(d_column.size() - 1);
return (last - first) + ((idx - last_index + 1) * d_filler.size_bytes());
}
// shift left: a,b,c,d,e,f -> b,c,d,e,f,x
return (idx < last_index) ? get_string_size_at(idx - offset) : d_filler.size_bytes();
} else {
if (idx < offset) {
return idx * d_filler.size_bytes();
} else {
auto const total_filler = d_filler.size_bytes() * offset;
return total_filler + d_column.element<size_type>(idx - offset);
}
// shift right: a,b,c,d,e,f -> x,a,b,c,d,e
return (idx < last_index) ? d_filler.size_bytes() : get_string_size_at(idx - offset);
}
}
};

struct shift_chars_fn {
column_device_view const d_column;
column_device_view const d_column; // input strings column
string_view const d_filler;
size_type const offset;

Expand All @@ -68,8 +66,11 @@ struct shift_chars_fn {
if (offset < 0) {
auto const last_index = -offset;
if (idx < last_index) {
auto const first_index = d_column.size() + offset;
return d_column.element<char>(idx + first_index);
auto const first_index =
offset + d_column.child(strings_column_view::offsets_column_index)
.element<size_type>(d_column.offset() + d_column.size());
return d_column.child(strings_column_view::chars_column_index)
.element<char>(idx + first_index);
} else {
auto const char_index = idx - last_index;
return d_filler.data()[char_index % d_filler.size_bytes()];
Expand All @@ -78,7 +79,10 @@ struct shift_chars_fn {
if (idx < offset) {
return d_filler.data()[idx % d_filler.size_bytes()];
} else {
return d_column.element<char>(idx - offset);
return d_column.child(strings_column_view::chars_column_index)
.element<char>(idx - offset +
d_column.child(strings_column_view::offsets_column_index)
.element<size_type>(d_column.offset()));
}
}
}
Expand All @@ -97,44 +101,30 @@ std::unique_ptr<column> shift(strings_column_view const& input,
// adjust offset when greater than the size of the input
if (std::abs(offset) > input.size()) { offset = input.size(); }

// output offsets column is the same size as the input
auto const input_offsets =
cudf::detail::slice(
input.offsets(), {input.offset(), input.offset() + input.size() + 1}, stream)
.front();
auto const offsets_size = input_offsets.size();
auto offsets_column = cudf::detail::allocate_like(
input_offsets, offsets_size, mask_allocation_policy::NEVER, stream, mr);

// run kernel to simultaneously shift and adjust the values in the output offsets column
auto d_offsets = mutable_column_device_view::create(offsets_column->mutable_view(), stream);
auto const d_input_offsets = column_device_view::create(input_offsets, stream);
thrust::transform(rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(offsets_size),
d_offsets->data<size_type>(),
adjust_offsets_fn{*d_input_offsets, d_fill_str, offset});
// build the output offsets by computing the sizes of each output row
auto const d_input = column_device_view::create(input.parent(), stream);
auto sizes_itr = cudf::detail::make_counting_transform_iterator(
0, output_sizes_fn{*d_input, d_fill_str, offset});
auto [offsets_column, total_bytes] =
cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
auto offsets_view = offsets_column->view();

// compute the shift-offset for the output characters child column
auto const shift_offset = [&] {
auto const index = (offset >= 0) ? offset : offsets_size - 1 + offset;
return (offset < 0 ? -1 : 1) *
cudf::detail::get_value<size_type>(offsets_column->view(), index, stream);
auto const index = (offset < 0) ? input.size() + offset : offset;
return (offset < 0 ? -1 : 1) * cudf::detail::get_value<size_type>(offsets_view, index, stream);
}();

// create output chars child column
auto const chars_size =
cudf::detail::get_value<size_type>(offsets_column->view(), offsets_size - 1, stream);
auto chars_column = create_chars_child_column(chars_size, stream, mr);
auto chars_column = create_chars_child_column(static_cast<size_type>(total_bytes), stream, mr);
auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream);
auto const d_input_chars = column_device_view::create(input.chars(), stream);

// run kernel to shift the characters
// run kernel to shift all the characters
thrust::transform(rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(chars_size),
thrust::counting_iterator<size_type>(total_bytes),
d_chars->data<char>(),
shift_chars_fn{*d_input_chars, d_fill_str, shift_offset});
shift_chars_fn{*d_input, d_fill_str, shift_offset});

// caller sets the null-mask
return make_strings_column(
Expand Down
38 changes: 32 additions & 6 deletions cpp/tests/copying/shift_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,22 +206,48 @@ TEST_F(ShiftTests, StringsShiftTest)
auto results = cudf::shift(input, 2, fill);
auto expected_right =
cudf::test::strings_column_wrapper({"xx", "xx", "", "bb", "ccc"}, {1, 1, 0, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_right, *results);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_right, *results);

results = cudf::shift(input, -2, fill);
auto expected_left =
cudf::test::strings_column_wrapper({"ccc", "ddddddé", "", "xx", "xx"}, {1, 1, 0, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_left, *results);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_left, *results);

auto sliced = cudf::slice(input, {1, 4}).front();

results = cudf::shift(sliced, 1, fill);
auto sliced_right = cudf::test::strings_column_wrapper({"xx", "bb", "ccc"});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced_right, *results);
auto sliced_right = cudf::test::strings_column_wrapper({"xx", "bb", "ccc"}, {1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(sliced_right, *results);

results = cudf::shift(sliced, -1, fill);
auto sliced_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "xx"});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced_left, *results);
auto sliced_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "xx"}, {1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(sliced_left, *results);
}

TEST_F(ShiftTests, StringsShiftNullFillTest)
{
auto input = cudf::test::strings_column_wrapper(
{"a", "b", "c", "d", "e", "ff", "ggg", "hhhh", "iii", "jjjjj"});
auto phil = cudf::string_scalar("", false);

auto results = cudf::shift(input, -1, phil);
auto expected = cudf::test::strings_column_wrapper(
{"b", "c", "d", "e", "ff", "ggg", "hhhh", "iii", "jjjjj", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

results = cudf::shift(input, 1, phil);
expected = cudf::test::strings_column_wrapper(
{"", "a", "b", "c", "d", "e", "ff", "ggg", "hhhh", "iii"}, {0, 1, 1, 1, 1, 1, 1, 1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

auto sliced = cudf::slice(input, {5, 10}).front();
results = cudf::shift(sliced, -2, phil);
expected = cudf::test::strings_column_wrapper({"hhhh", "iii", "jjjjj", "", ""}, {1, 1, 1, 0, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

results = cudf::shift(sliced, 2, phil);
expected = cudf::test::strings_column_wrapper({"", "", "ff", "ggg", "hhhh"}, {0, 0, 1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(ShiftTests, OffsetGreaterThanSize)
Expand Down

0 comments on commit 88a8efc

Please sign in to comment.