Skip to content

Commit

Permalink
Add strings support to cudf::shift function (#8648)
Browse files Browse the repository at this point in the history
Closes #3915 

Added strings specialization logic to the current `cudf::shift` API. Rows are shifted by running transform functions on the offsets and chars child columns to create the output strings column. The gtests were added to `copying/shift_tests.cpp`.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Christopher Harris (https://github.com/cwharris)
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)
  - Mark Harris (https://github.com/harrism)

URL: #8648
  • Loading branch information
davidwendt authored Jul 9, 2021
1 parent ecbec81 commit 1b34652
Show file tree
Hide file tree
Showing 5 changed files with 256 additions and 18 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ add_library(cudf
src/strings/convert/convert_urls.cu
src/strings/copying/concatenate.cu
src/strings/copying/copying.cu
src/strings/copying/shift.cu
src/strings/extract.cu
src/strings/filling/fill.cu
src/strings/filter_chars.cu
Expand Down
28 changes: 28 additions & 0 deletions cpp/include/cudf/strings/detail/copying.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand Down Expand Up @@ -54,6 +55,33 @@ std::unique_ptr<cudf::column> copy_slice(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a new strings column created by shifting the rows by a specified offset.
*
* A positive `offset` moves rows toward the end of the column and a negative `offset`
* moves them toward the beginning. Rows vacated by the shift receive the string held
* by `fill_value`; rows shifted past either end are dropped.
*
* @code{.pseudo}
* Example:
* s = ["a", "b", "c", "d", "e", "f"]
* r1 = shift(s, 2, "_")
* r1 is now ["_", "_", "a", "b", "c", "d"]
* r2 = shift(s, -2, "_")
* r2 is now ["c", "d", "e", "f", "_", "_"]
* @endcode
*
* The returned column is built without a null mask (null count 0).
* The caller should set the validity mask in the output column.
*
* @param input Strings instance for this operation.
* @param offset The offset by which to shift the input.
* @param fill_value Fill value for indeterminable outputs.
*                   The implementation casts this to a `string_scalar`.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column.
*/
std::unique_ptr<column> shift(strings_column_view const& input,
size_type offset,
scalar const& fill_value,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace detail
} // namespace strings
} // namespace cudf
69 changes: 52 additions & 17 deletions cpp/src/copying/shift.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/valid_if.cuh>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/detail/copying.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
Expand All @@ -44,13 +45,55 @@ inline bool __device__ out_of_bounds(size_type size, size_type idx)
return idx < 0 || idx >= size;
}

/**
 * @brief Builds the validity mask for a shifted column.
 *
 * Output row `idx` takes its validity from input row `idx - offset`. When that
 * source row lies outside the input, the validity of `fill_value` is used instead.
 *
 * @param input Device view of the column being shifted.
 * @param offset Number of rows the column is shifted by.
 * @param fill_value Scalar whose validity flag covers the vacated rows.
 * @param stream CUDA stream passed to `valid_if`.
 * @param mr Device memory resource passed to `valid_if`.
 * @return The pair produced by `valid_if`: the mask buffer and its null count.
 */
std::pair<rmm::device_buffer, size_type> create_null_mask(column_device_view const& input,
                                                          size_type offset,
                                                          scalar const& fill_value,
                                                          rmm::cuda_stream_view stream,
                                                          rmm::mr::device_memory_resource* mr)
{
  auto const num_rows   = input.size();
  auto const fill_valid = fill_value.validity_data();
  auto const first_row  = thrust::make_counting_iterator<size_type>(0);
  auto const last_row   = thrust::make_counting_iterator<size_type>(num_rows);

  // Predicate evaluated once per output row.
  auto const row_is_valid = [num_rows, offset, fill_valid, input] __device__(size_type idx) {
    auto const source_row = idx - offset;
    return out_of_bounds(num_rows, source_row) ? *fill_valid : input.is_valid(source_row);
  };

  return detail::valid_if(first_row, last_row, row_is_valid, stream, mr);
}

struct shift_functor {
template <typename T, typename... Args>
std::enable_if_t<not cudf::is_fixed_width<T>(), std::unique_ptr<column>> operator()(Args&&...)
std::enable_if_t<not cudf::is_fixed_width<T>() and not std::is_same_v<cudf::string_view, T>,
std::unique_ptr<column>>
operator()(Args&&...)
{
CUDF_FAIL("shift does not support non-fixed-width types.");
}

// Strings overload: the rows are shifted by cudf::strings::detail::shift, which
// leaves the validity mask to its caller, so the mask is built and attached here.
template <typename T, typename... Args>
std::enable_if_t<std::is_same_v<cudf::string_view, T>, std::unique_ptr<column>> operator()(
  column_view const& input,
  size_type offset,
  scalar const& fill_value,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr)
{
  auto const strings_input = cudf::strings_column_view(input);
  auto shifted = cudf::strings::detail::shift(strings_input, offset, fill_value, stream, mr);

  // A mask is required only if an output row can be null: either the input may
  // hold nulls or the fill value itself is null.
  bool const needs_mask = input.nullable() || not fill_value.is_valid(stream);
  if (not needs_mask) { return shifted; }

  auto const device_input      = column_device_view::create(input, stream);
  auto [null_mask, null_count] = create_null_mask(*device_input, offset, fill_value, stream, mr);
  shifted->set_null_mask(std::move(null_mask));
  shifted->set_null_count(null_count);
  return shifted;
}

template <typename T>
std::enable_if_t<cudf::is_fixed_width<T>(), std::unique_ptr<column>> operator()(
column_view const& input,
Expand All @@ -67,29 +110,21 @@ struct shift_functor {
detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr);
auto device_output = mutable_column_device_view::create(*output);

auto size = input.size();
auto index_begin = thrust::make_counting_iterator<size_type>(0);
auto index_end = thrust::make_counting_iterator<size_type>(size);

if (input.nullable() || not scalar.is_valid()) {
auto func_validity = [size,
offset,
fill = scalar.validity_data(),
input = *device_input] __device__(size_type idx) {
auto src_idx = idx - offset;
return out_of_bounds(size, src_idx) ? *fill : input.is_valid(src_idx);
};

auto mask_pair = detail::valid_if(index_begin, index_end, func_validity, stream, mr);
auto const scalar_is_valid = scalar.is_valid(stream);

if (input.nullable() || not scalar_is_valid) {
auto mask_pair = create_null_mask(*device_input, offset, fill_value, stream, mr);
output->set_null_mask(std::move(std::get<0>(mask_pair)));
output->set_null_count(std::get<1>(mask_pair));
}

auto data = device_output->data<T>();
auto const size = input.size();
auto index_begin = thrust::make_counting_iterator<size_type>(0);
auto index_end = thrust::make_counting_iterator<size_type>(size);
auto data = device_output->data<T>();

// avoid assigning elements we know to be invalid.
if (not scalar.is_valid()) {
if (not scalar_is_valid) {
if (offset > 0) {
index_begin = thrust::make_counting_iterator<size_type>(offset);
data = data + offset;
Expand Down
144 changes: 144 additions & 0 deletions cpp/src/strings/copying/shift.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/strings/detail/copying.hpp>
#include <cudf/strings/detail/utilities.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

namespace cudf::strings::detail {

namespace {

/**
 * @brief Computes one entry of the output offsets column.
 *
 * `d_column` is the input offsets child sliced to the rows of the input view, so it
 * holds one more entry than the input has rows and its first entry may be non-zero.
 * Every value returned is relative to the start of the output chars column.
 *
 * `offset` must already be clamped to [-rows, rows] by the caller.
 */
struct adjust_offsets_fn {
  column_device_view const d_column;  ///< sliced input offsets (rows + 1 entries)
  string_view const d_filler;         ///< string written into vacated rows
  size_type const offset;             ///< row shift, clamped to [-rows, rows]

  __device__ offset_type operator()(size_type idx)
  {
    if (offset < 0) {
      // Left shift: the kept input rows come first, followed by the fill rows.
      auto const first      = d_column.element<offset_type>(-offset);
      auto const last_index = d_column.size() + offset;
      if (idx < last_index) { return d_column.element<offset_type>(idx - offset) - first; }
      auto const last = d_column.element<offset_type>(d_column.size() - 1);
      return (last - first) + ((idx - last_index + 1) * d_filler.size_bytes());
    }

    // Right shift: the fill rows come first, followed by the kept input rows.
    if (idx < offset) { return idx * d_filler.size_bytes(); }
    // Rebase on the first input offset so a sliced input (first offset != 0)
    // does not leak its leading bytes into the output offsets.
    auto const first        = d_column.element<offset_type>(0);
    auto const total_filler = d_filler.size_bytes() * offset;
    return total_filler + (d_column.element<offset_type>(idx - offset) - first);
  }
};

/**
 * @brief Produces one byte of the output chars column.
 *
 * The output chars consist of a single contiguous run copied from the input chars
 * plus repeated copies of the fill string placed before it (right shift) or after it
 * (left shift). The copied run is described explicitly instead of being inferred
 * from the sign of a byte offset, which is ambiguous when the run is empty.
 */
struct shift_chars_fn {
  column_device_view const d_column;  ///< input chars child column
  string_view const d_filler;         ///< string written into vacated rows
  size_type const copy_begin;         ///< output byte index where the copied run starts
  size_type const copy_size;          ///< number of bytes copied from the input
  size_type const src_begin;          ///< input byte index of the first copied byte

  __device__ char operator()(size_type idx)
  {
    auto const copy_idx = idx - copy_begin;
    if (copy_idx >= 0 && copy_idx < copy_size) {
      return d_column.element<char>(src_begin + copy_idx);
    }
    // Fill bytes are only requested when the filler is non-empty: an empty filler
    // contributes no output bytes, so the modulo below never sees a zero divisor.
    auto const fill_idx = (copy_idx < 0) ? idx : copy_idx - copy_size;
    return d_filler.data()[fill_idx % d_filler.size_bytes()];
  }
};

}  // namespace

/**
 * @brief Shifts the rows of a strings column, filling vacated rows with `fill_value`.
 *
 * The output offsets and chars children are each produced by a transform over the
 * corresponding input child. The returned column carries no null mask; the caller
 * is expected to set it.
 *
 * Handling of edge cases:
 *  - an empty input returns an empty strings column without touching child columns;
 *  - an `offset` whose magnitude exceeds the row count is clamped to the row count,
 *    which yields a column made only of fill values instead of indexing past the
 *    offsets column;
 *  - sliced inputs are read starting at the slice's first byte in the chars child;
 *  - a left shift that keeps zero input bytes writes fill bytes, not input bytes.
 *
 * @param input Strings column view to shift.
 * @param offset Rows to shift by; positive toward the end, negative toward the start.
 * @param fill_value Scalar cast to `string_scalar`; its string fills the vacated rows.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @return New strings column with the same number of rows as `input`.
 */
std::unique_ptr<column> shift(strings_column_view const& input,
                              size_type offset,
                              scalar const& fill_value,
                              rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
{
  auto const num_rows = input.size();
  if (num_rows == 0) { return make_empty_column(data_type{type_id::STRING}); }

  // Shifting by more rows than exist is equivalent to shifting every row out.
  if (offset > num_rows) {
    offset = num_rows;
  } else if (offset < -num_rows) {
    offset = -num_rows;
  }

  auto d_fill_str = static_cast<string_scalar const&>(fill_value).value(stream);

  // output offsets column is the same size as the (sliced) input offsets
  auto const input_offsets =
    cudf::slice(input.offsets(), {input.offset(), input.offset() + num_rows + 1}).front();
  auto const offsets_size = input_offsets.size();
  auto offsets_column     = cudf::detail::allocate_like(
    input_offsets, offsets_size, mask_allocation_policy::NEVER, stream, mr);

  // run kernel to simultaneously shift and adjust the values in the output offsets column
  auto d_offsets = mutable_column_device_view::create(offsets_column->mutable_view(), stream);
  auto const d_input_offsets = column_device_view::create(input_offsets, stream);
  thrust::transform(rmm::exec_policy(stream),
                    thrust::counting_iterator<size_type>(0),
                    thrust::counting_iterator<size_type>(offsets_size),
                    d_offsets->data<offset_type>(),
                    adjust_offsets_fn{*d_input_offsets, d_fill_str, offset});

  // total output bytes is the last output offset
  auto const chars_size =
    cudf::detail::get_value<offset_type>(offsets_column->view(), offsets_size - 1, stream);

  // describe the run of bytes copied from the input chars
  size_type const fill_rows  = (offset < 0) ? -offset : offset;
  size_type const fill_bytes = fill_rows * d_fill_str.size_bytes();
  size_type const copy_size  = chars_size - fill_bytes;
  size_type const copy_begin = (offset < 0) ? 0 : fill_bytes;
  // first kept input row is row -offset for a left shift and row 0 otherwise
  size_type const src_begin =
    cudf::detail::get_value<offset_type>(input_offsets, (offset < 0) ? -offset : 0, stream);

  // create output chars child column
  auto chars_column = create_chars_child_column(chars_size, stream, mr);
  auto d_chars      = mutable_column_device_view::create(chars_column->mutable_view(), stream);
  auto const d_input_chars = column_device_view::create(input.chars(), stream);

  // run kernel to shift the characters
  thrust::transform(rmm::exec_policy(stream),
                    thrust::counting_iterator<size_type>(0),
                    thrust::counting_iterator<size_type>(chars_size),
                    d_chars->data<char>(),
                    shift_chars_fn{*d_input_chars, d_fill_str, copy_begin, copy_size, src_begin});

  // caller sets the null-mask
  return make_strings_column(num_rows,
                             std::move(offsets_column),
                             std::move(chars_column),
                             0,
                             rmm::device_buffer{},
                             stream,
                             mr);
}

}  // namespace cudf::strings::detail
32 changes: 31 additions & 1 deletion cpp/tests/copying/shift_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -201,3 +201,33 @@ TYPED_TEST(ShiftTest, MismatchFillValueDtypes)

EXPECT_THROW(output = cudf::shift(input, 5, *fill), cudf::logic_error);
}

struct ShiftTestNonFixedWidth : public cudf::test::BaseFixture {
};

TEST_F(ShiftTestNonFixedWidth, StringsShiftTest)
{
auto input =
cudf::test::strings_column_wrapper({"", "bb", "ccc", "ddddddé", ""}, {0, 1, 1, 1, 0});

auto fill = cudf::string_scalar("xx");
auto results = cudf::shift(input, 2, fill);
auto expected_right =
cudf::test::strings_column_wrapper({"xx", "xx", "", "bb", "ccc"}, {1, 1, 0, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_right, *results);

results = cudf::shift(input, -2, fill);
auto expected_left =
cudf::test::strings_column_wrapper({"ccc", "ddddddé", "", "xx", "xx"}, {1, 1, 0, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_left, *results);

auto sliced = cudf::slice(input, {1, 4}).front();

results = cudf::shift(sliced, 1, fill);
auto sliced_right = cudf::test::strings_column_wrapper({"xx", "bb", "ccc"});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced_right, *results);

results = cudf::shift(sliced, -1, fill);
auto sliced_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "xx"});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced_left, *results);
}

0 comments on commit 1b34652

Please sign in to comment.