Skip to content

Commit

Permalink
Remove unneeded step parameter from strings::detail::copy_slice (rapi…
Browse files Browse the repository at this point in the history
…dsai#7525)

This started to be a change converting some `device_vector` usages in `cpp/src/strings` source files to use `device_uvector` instead. The `cpp/src/strings/copying/copying.cu` source has the implementation for `cudf::strings::detail::copy_slice()` and used a `device_vector` to handle a `step` parameter. I can not longer find this parameter being used. I believe it was a hold over from porting nvstrings. So this PR mainly includes changes for removing this unneeded parameter which also removes the need for the `device_vector` or temporary memory in this function.
And, it also includes changes to `attributes.cu` to use the `device_uvector` as well.

~~I'm marking this as non-breaking change since it is a change to a `detail` API and did not seem to be used anywhere in this repo.~~

Reference rapidsai#7287

Authors:
  - David (@davidwendt)

Approvers:
  - AJ Schmidt (@ajschmidt8)
  - Mike Wilson (@hyperbolic2346)
  - Jake Hemstad (@jrhemstad)

URL: rapidsai#7525
  • Loading branch information
davidwendt authored and hyperbolic2346 committed Mar 23, 2021
1 parent b7dd2cd commit 5752ad3
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 122 deletions.
2 changes: 1 addition & 1 deletion conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,10 @@ test:
- test -f $PREFIX/include/cudf/strings/convert/convert_integers.hpp
- test -f $PREFIX/include/cudf/strings/convert/convert_ipv4.hpp
- test -f $PREFIX/include/cudf/strings/convert/convert_urls.hpp
- test -f $PREFIX/include/cudf/strings/copying.hpp
- test -f $PREFIX/include/cudf/strings/detail/combine.hpp
- test -f $PREFIX/include/cudf/strings/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/strings/detail/converters.hpp
- test -f $PREFIX/include/cudf/strings/detail/copying.hpp
- test -f $PREFIX/include/cudf/strings/detail/fill.hpp
- test -f $PREFIX/include/cudf/strings/detail/replace.hpp
- test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,24 +25,24 @@ namespace strings {
namespace detail {
/**
* @brief Returns a new strings column created from a subset of
* of the strings column. The subset of strings selected is between
* start (inclusive) and end (exclusive) with increments of step.
* of the strings column.
*
* The subset of strings selected is between
* start (inclusive) and end (exclusive).
*
* @code{.pseudo}
* Example:
* s1 = ["a", "b", "c", "d", "e", "f"]
* s2 = slice( s1, 2 )
* s2 = copy_slice( s1, 2 )
* s2 is ["c", "d", "e", "f"]
* s3 = slice( s1, 1, 2 )
* s3 is ["b", "d", "f"]
* s2 = copy_slice( s1, 1, 3 )
* s2 is ["b", "c"]
* @endcode
*
* @param strings Strings instance for this operation.
* @param start Index to first string to select in the column (inclusive).
* @param end Index to last string to select in the column (exclusive).
* Default -1 indicates the last element.
* @param step Increment value between indices.
* Default step is 1.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column of size (end-start)/step.
Expand All @@ -51,7 +51,6 @@ std::unique_ptr<cudf::column> copy_slice(
strings_column_view const& strings,
size_type start,
size_type end = -1,
size_type step = 1,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/column/column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#include <cudf/lists/detail/copying.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/strings/copying.hpp>
#include <cudf/strings/detail/copying.hpp>
#include <cudf/structs/structs_column_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>
Expand Down Expand Up @@ -193,7 +193,7 @@ struct create_column_from_view {
std::unique_ptr<column> operator()()
{
cudf::strings_column_view sview(view);
return cudf::strings::detail::copy_slice(sview, 0, view.size(), 1, stream, mr);
return cudf::strings::detail::copy_slice(sview, 0, view.size(), stream, mr);
}

template <typename ColumnType,
Expand Down
44 changes: 21 additions & 23 deletions cpp/src/strings/attributes.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,7 +25,7 @@
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/transform.h>
Expand Down Expand Up @@ -54,28 +54,26 @@ std::unique_ptr<column> counts_fn(strings_column_view const& strings,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto strings_count = strings.size();
// create output column
auto results = make_numeric_column(data_type{type_id::INT32},
strings.size(),
cudf::detail::copy_bitmask(strings.parent(), stream, mr),
strings.null_count(),
stream,
mr);
auto d_lengths = results->mutable_view().data<int32_t>();
// input column device view
auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
auto d_strings = *strings_column;
// create output column
auto results = std::make_unique<cudf::column>(
cudf::data_type{type_id::INT32},
strings_count,
rmm::device_buffer(strings_count * sizeof(int32_t), stream, mr),
cudf::detail::copy_bitmask(strings.parent(), stream, mr), // copy the null mask
strings.null_count());
auto results_view = results->mutable_view();
auto d_lengths = results_view.data<int32_t>();
// fill in the lengths
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
thrust::make_counting_iterator<cudf::size_type>(strings.size()),
d_lengths,
[d_strings, ufn] __device__(size_type idx) {
int32_t length = 0;
if (!d_strings.is_null(idx))
length = static_cast<int32_t>(ufn(d_strings.element<string_view>(idx)));
return length;
return d_strings.is_null(idx)
? 0
: static_cast<int32_t>(ufn(d_strings.element<string_view>(idx)));
});
results->set_null_count(strings.null_count()); // reset null count
return results;
Expand Down Expand Up @@ -140,23 +138,23 @@ std::unique_ptr<column> code_points(
auto d_column = *strings_column;

// create offsets vector to account for each string's character length
rmm::device_vector<size_type> offsets(strings.size() + 1);
size_type* d_offsets = offsets.data().get();
rmm::device_uvector<size_type> offsets(strings.size() + 1, stream);
thrust::transform_inclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings.size()),
d_offsets + 1,
offsets.begin() + 1,
[d_column] __device__(size_type idx) {
size_type length = 0;
if (!d_column.is_null(idx)) length = d_column.element<string_view>(idx).length();
return length;
},
thrust::plus<size_type>());
CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(size_type), stream.value()));
size_type const zero = 0;
offsets.set_element_async(0, zero, stream);

// the total size is the number of characters in the entire column
size_type num_characters = offsets.back();
size_type num_characters = offsets.back_element(stream);
// create output column with no nulls
auto results = make_numeric_column(
data_type{type_id::INT32}, num_characters, mask_state::UNALLOCATED, stream, mr);
Expand All @@ -167,7 +165,7 @@ std::unique_ptr<column> code_points(
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
strings.size(),
code_points_fn{d_column, d_offsets, d_results});
code_points_fn{d_column, offsets.data(), d_results});

results->set_null_count(0);
return results;
Expand Down
90 changes: 40 additions & 50 deletions cpp/src/strings/copying/copying.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,77 +14,67 @@
* limitations under the License.
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/strings/copying.hpp>
#include <cudf/strings/detail/copying.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/sequence.h>

namespace cudf {
namespace strings {
namespace detail {
// new strings column from subset of this strings instance

std::unique_ptr<cudf::column> copy_slice(strings_column_view const& strings,
size_type start,
size_type end,
size_type step,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
size_type strings_count = strings.size();
if (strings_count == 0) return make_empty_strings_column(stream, mr);
if (step == 0) step = 1;
CUDF_EXPECTS(step > 0, "Parameter step must be positive integer.");
if (end < 0 || end > strings_count) end = strings_count;
if (strings.is_empty()) return make_empty_strings_column(stream, mr);
if (end < 0 || end > strings.size()) end = strings.size();
CUDF_EXPECTS(((start >= 0) && (start < end)), "Invalid start parameter value.");
strings_count = cudf::util::round_up_safe<size_type>((end - start), step);
if (start == 0 && strings.offset() == 0 && step == 1) {
// sliced at the beginning and copying every step, so no need to gather
auto offsets_column = std::make_unique<cudf::column>(
cudf::slice(strings.offsets(), {0, strings_count + 1}).front(), stream, mr);
auto data_size =
cudf::detail::get_value<int32_t>(offsets_column->view(), strings_count, stream);
auto chars_column = std::make_unique<cudf::column>(
cudf::slice(strings.chars(), {0, data_size}).front(), stream, mr);
auto null_mask = cudf::detail::copy_bitmask(strings.null_mask(), 0, strings_count, stream, mr);
return make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column),
UNKNOWN_NULL_COUNT,
std::move(null_mask),
stream,
mr);
auto const strings_count = end - start;
auto const offsets_offset = start + strings.offset();

// slice the offsets child column
auto offsets_column = std::make_unique<cudf::column>(
cudf::slice(strings.offsets(), {offsets_offset, offsets_offset + strings_count + 1}).front(),
stream,
mr);
auto const chars_offset =
offsets_offset == 0 ? 0 : cudf::detail::get_value<int32_t>(offsets_column->view(), 0, stream);
if (chars_offset > 0) {
// adjust the individual offset values only if needed
auto d_offsets = offsets_column->mutable_view();
thrust::transform(rmm::exec_policy(stream),
d_offsets.begin<int32_t>(),
d_offsets.end<int32_t>(),
d_offsets.begin<int32_t>(),
[chars_offset] __device__(auto offset) { return offset - chars_offset; });
}

// do the gather instead
// build indices
rmm::device_vector<size_type> indices(strings_count);
thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end(), start, step);
// create a column_view as a wrapper of these indices
column_view indices_view(
data_type{type_id::INT32}, strings_count, indices.data().get(), nullptr, 0);
// build a new strings column from the indices
auto sliced_table = cudf::detail::gather(table_view{{strings.parent()}},
indices_view,
cudf::out_of_bounds_policy::DONT_CHECK,
cudf::detail::negative_index_policy::NOT_ALLOWED,
stream,
mr)
->release();
std::unique_ptr<column> output_column(std::move(sliced_table.front()));
if (output_column->null_count() == 0)
output_column->set_null_mask(rmm::device_buffer{0, stream, mr}, 0);
return output_column;
// slice the chars child column
auto const data_size =
cudf::detail::get_value<int32_t>(offsets_column->view(), strings_count, stream);
auto chars_column = std::make_unique<cudf::column>(
cudf::slice(strings.chars(), {chars_offset, chars_offset + data_size}).front(), stream, mr);

// slice the null mask
auto null_mask = cudf::detail::copy_bitmask(
strings.null_mask(), offsets_offset, offsets_offset + strings_count, stream, mr);

return make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column),
UNKNOWN_NULL_COUNT,
std::move(null_mask),
stream,
mr);
}

} // namespace detail
Expand Down
58 changes: 21 additions & 37 deletions cpp/tests/strings/array_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/scalar/scalar.hpp>
#include <cudf/sorting.hpp>
#include <cudf/strings/copying.hpp>
#include <cudf/strings/detail/copying.hpp>
#include <cudf/strings/detail/scatter.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -71,20 +71,17 @@ TEST_P(SliceParmsTest, Slice)
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));

cudf::size_type start = 3;
cudf::size_type end = GetParam();
std::vector<const char*> h_expected;
if (end > start) {
for (cudf::size_type idx = start; (idx < end) && (idx < (cudf::size_type)h_strings.size());
++idx)
h_expected.push_back(h_strings[idx]);
}
auto strings_view = cudf::strings_column_view(strings);
auto results = cudf::strings::detail::copy_slice(strings_view, start, end);
auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end);

cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end());
// thrust::make_transform_iterator( h_expected.begin(), [] (auto str) { return str!=nullptr; }));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
cudf::test::strings_column_wrapper expected(
h_strings.begin() + start,
h_strings.begin() + end,
thrust::make_transform_iterator(h_strings.begin() + start,
[](auto str) { return str != nullptr; }));
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}

TEST_P(SliceParmsTest, SliceAllNulls)
Expand All @@ -94,42 +91,29 @@ TEST_P(SliceParmsTest, SliceAllNulls)
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));

cudf::size_type start = 3;
cudf::size_type end = GetParam();
std::vector<const char*> h_expected;
if (end > start) {
for (cudf::size_type idx = start; (idx < end) && (idx < (cudf::size_type)h_strings.size());
++idx)
h_expected.push_back(h_strings[idx]);
}
auto strings_view = cudf::strings_column_view(strings);
auto results = cudf::strings::detail::copy_slice(strings_view, start, end);
auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end);

cudf::test::strings_column_wrapper expected(
h_expected.begin(),
h_expected.end(),
thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
h_strings.begin() + start,
h_strings.begin() + end,
thrust::make_transform_iterator(h_strings.begin() + start,
[](auto str) { return str != nullptr; }));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_P(SliceParmsTest, SliceAllEmpty)
{
std::vector<const char*> h_strings{"", "", "", "", "", "", ""};
cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());

cudf::size_type start = 3;
cudf::size_type end = GetParam();
std::vector<const char*> h_expected;
if (end > start) {
for (cudf::size_type idx = start; (idx < end) && (idx < (cudf::size_type)h_strings.size());
++idx)
h_expected.push_back(h_strings[idx]);
}
auto strings_view = cudf::strings_column_view(strings);
auto results = cudf::strings::detail::copy_slice(strings_view, start, end);
cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end());
// thrust::make_transform_iterator( h_expected.begin(), [] (auto str) { return str!=nullptr; }));
auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end);

cudf::test::strings_column_wrapper expected(h_strings.begin() + start, h_strings.begin() + end);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

Expand Down

0 comments on commit 5752ad3

Please sign in to comment.