Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove unneeded step parameter from strings::detail::copy_slice #7525

Merged
merged 9 commits into from
Mar 11, 2021
2 changes: 1 addition & 1 deletion conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -167,10 +167,10 @@ test:
- test -f $PREFIX/include/cudf/strings/convert/convert_integers.hpp
- test -f $PREFIX/include/cudf/strings/convert/convert_ipv4.hpp
- test -f $PREFIX/include/cudf/strings/convert/convert_urls.hpp
- test -f $PREFIX/include/cudf/strings/copying.hpp
- test -f $PREFIX/include/cudf/strings/detail/combine.hpp
- test -f $PREFIX/include/cudf/strings/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/strings/detail/converters.hpp
- test -f $PREFIX/include/cudf/strings/detail/copying.hpp
- test -f $PREFIX/include/cudf/strings/detail/fill.hpp
- test -f $PREFIX/include/cudf/strings/detail/replace.hpp
- test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,24 +25,24 @@ namespace strings {
namespace detail {
/**
* @brief Returns a new strings column created from a subset of
* of the strings column. The subset of strings selected is between
* start (inclusive) and end (exclusive) with increments of step.
* of the strings column.
*
* The subset of strings selected is between
* start (inclusive) and end (exclusive).
*
* @code{.pseudo}
* Example:
* s1 = ["a", "b", "c", "d", "e", "f"]
* s2 = slice( s1, 2 )
* s2 = copy_slice( s1, 2 )
* s2 is ["c", "d", "e", "f"]
* s3 = slice( s1, 1, 2 )
* s3 is ["b", "d", "f"]
* s2 = copy_slice( s1, 1, 3 )
* s2 is ["b", "c"]
* @endcode
*
* @param strings Strings instance for this operation.
* @param start Index to first string to select in the column (inclusive).
* @param end Index to last string to select in the column (exclusive).
* Default -1 indicates the last element.
* @param step Increment value between indices.
* Default step is 1.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column of size (end-start)/step.
Expand All @@ -51,7 +51,6 @@ std::unique_ptr<cudf::column> copy_slice(
strings_column_view const& strings,
size_type start,
size_type end = -1,
size_type step = 1,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/column/column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#include <cudf/lists/detail/copying.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/strings/copying.hpp>
#include <cudf/strings/detail/copying.hpp>
#include <cudf/structs/structs_column_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>
Expand Down Expand Up @@ -193,7 +193,7 @@ struct create_column_from_view {
std::unique_ptr<column> operator()()
{
cudf::strings_column_view sview(view);
return cudf::strings::detail::copy_slice(sview, 0, view.size(), 1, stream, mr);
return cudf::strings::detail::copy_slice(sview, 0, view.size(), stream, mr);
}

template <typename ColumnType,
Expand Down
43 changes: 20 additions & 23 deletions cpp/src/strings/attributes.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,7 +25,7 @@
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/transform.h>
Expand Down Expand Up @@ -54,28 +54,26 @@ std::unique_ptr<column> counts_fn(strings_column_view const& strings,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto strings_count = strings.size();
// create output column
auto results = make_numeric_column(data_type{type_id::INT32},
strings.size(),
cudf::detail::copy_bitmask(strings.parent(), stream, mr),
strings.null_count(),
stream,
mr);
auto d_lengths = results->mutable_view().data<int32_t>();
// input column device view
auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
auto d_strings = *strings_column;
// create output column
auto results = std::make_unique<cudf::column>(
cudf::data_type{type_id::INT32},
strings_count,
rmm::device_buffer(strings_count * sizeof(int32_t), stream, mr),
cudf::detail::copy_bitmask(strings.parent(), stream, mr), // copy the null mask
strings.null_count());
auto results_view = results->mutable_view();
auto d_lengths = results_view.data<int32_t>();
// fill in the lengths
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
thrust::make_counting_iterator<cudf::size_type>(strings.size()),
d_lengths,
[d_strings, ufn] __device__(size_type idx) {
int32_t length = 0;
if (!d_strings.is_null(idx))
length = static_cast<int32_t>(ufn(d_strings.element<string_view>(idx)));
return length;
return d_strings.is_null(idx)
? 0
: static_cast<int32_t>(ufn(d_strings.element<string_view>(idx)));
});
results->set_null_count(strings.null_count()); // reset null count
return results;
Expand Down Expand Up @@ -140,23 +138,22 @@ std::unique_ptr<column> code_points(
auto d_column = *strings_column;

// create offsets vector to account for each string's character length
rmm::device_vector<size_type> offsets(strings.size() + 1);
size_type* d_offsets = offsets.data().get();
rmm::device_uvector<size_type> offsets(strings.size() + 1, stream);
thrust::transform_inclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings.size()),
d_offsets + 1,
offsets.begin() + 1,
[d_column] __device__(size_type idx) {
size_type length = 0;
if (!d_column.is_null(idx)) length = d_column.element<string_view>(idx).length();
return length;
},
thrust::plus<size_type>());
CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(size_type), stream.value()));
offsets.set_element(0, 0, stream);

// the total size is the number of characters in the entire column
size_type num_characters = offsets.back();
size_type num_characters = offsets.back_element(stream);
// create output column with no nulls
auto results = make_numeric_column(
data_type{type_id::INT32}, num_characters, mask_state::UNALLOCATED, stream, mr);
Expand All @@ -167,7 +164,7 @@ std::unique_ptr<column> code_points(
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
strings.size(),
code_points_fn{d_column, d_offsets, d_results});
code_points_fn{d_column, offsets.data(), d_results});

results->set_null_count(0);
return results;
Expand Down
90 changes: 40 additions & 50 deletions cpp/src/strings/copying/copying.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,77 +14,67 @@
* limitations under the License.
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/strings/copying.hpp>
#include <cudf/strings/detail/copying.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/sequence.h>

namespace cudf {
namespace strings {
namespace detail {
// new strings column from subset of this strings instance

std::unique_ptr<cudf::column> copy_slice(strings_column_view const& strings,
size_type start,
size_type end,
size_type step,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
size_type strings_count = strings.size();
if (strings_count == 0) return make_empty_strings_column(stream, mr);
if (step == 0) step = 1;
CUDF_EXPECTS(step > 0, "Parameter step must be positive integer.");
if (end < 0 || end > strings_count) end = strings_count;
if (strings.is_empty()) return make_empty_strings_column(stream, mr);
if (end < 0 || end > strings.size()) end = strings.size();
CUDF_EXPECTS(((start >= 0) && (start < end)), "Invalid start parameter value.");
strings_count = cudf::util::round_up_safe<size_type>((end - start), step);
if (start == 0 && strings.offset() == 0 && step == 1) {
// sliced at the beginning and copying every step, so no need to gather
auto offsets_column = std::make_unique<cudf::column>(
cudf::slice(strings.offsets(), {0, strings_count + 1}).front(), stream, mr);
auto data_size =
cudf::detail::get_value<int32_t>(offsets_column->view(), strings_count, stream);
auto chars_column = std::make_unique<cudf::column>(
cudf::slice(strings.chars(), {0, data_size}).front(), stream, mr);
auto null_mask = cudf::detail::copy_bitmask(strings.null_mask(), 0, strings_count, stream, mr);
return make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column),
UNKNOWN_NULL_COUNT,
std::move(null_mask),
stream,
mr);
auto const strings_count = end - start;
auto const offsets_offset = start + strings.offset();

// slice the offsets child column
auto offsets_column = std::make_unique<cudf::column>(
cudf::slice(strings.offsets(), {offsets_offset, offsets_offset + strings_count + 1}).front(),
stream,
mr);
auto const chars_offset =
offsets_offset == 0 ? 0 : cudf::detail::get_value<int32_t>(offsets_column->view(), 0, stream);
if (chars_offset > 0) {
// adjust the individual offset values only if needed
auto d_offsets = offsets_column->mutable_view();
thrust::transform(rmm::exec_policy(stream),
d_offsets.begin<int32_t>(),
d_offsets.end<int32_t>(),
d_offsets.begin<int32_t>(),
[chars_offset] __device__(auto offset) { return offset - chars_offset; });
}

// do the gather instead
// build indices
rmm::device_vector<size_type> indices(strings_count);
thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end(), start, step);
// create a column_view as a wrapper of these indices
column_view indices_view(
data_type{type_id::INT32}, strings_count, indices.data().get(), nullptr, 0);
// build a new strings column from the indices
auto sliced_table = cudf::detail::gather(table_view{{strings.parent()}},
indices_view,
cudf::out_of_bounds_policy::DONT_CHECK,
cudf::detail::negative_index_policy::NOT_ALLOWED,
stream,
mr)
->release();
std::unique_ptr<column> output_column(std::move(sliced_table.front()));
if (output_column->null_count() == 0)
output_column->set_null_mask(rmm::device_buffer{0, stream, mr}, 0);
return output_column;
// slice the chars child column
auto const data_size =
cudf::detail::get_value<int32_t>(offsets_column->view(), strings_count, stream);
auto chars_column = std::make_unique<cudf::column>(
cudf::slice(strings.chars(), {chars_offset, chars_offset + data_size}).front(), stream, mr);

// slice the null mask
auto null_mask = cudf::detail::copy_bitmask(
strings.null_mask(), offsets_offset, offsets_offset + strings_count, stream, mr);

return make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column),
UNKNOWN_NULL_COUNT,
std::move(null_mask),
stream,
mr);
}

} // namespace detail
Expand Down
58 changes: 21 additions & 37 deletions cpp/tests/strings/array_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/scalar/scalar.hpp>
#include <cudf/sorting.hpp>
#include <cudf/strings/copying.hpp>
#include <cudf/strings/detail/copying.hpp>
#include <cudf/strings/detail/scatter.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -71,20 +71,17 @@ TEST_P(SliceParmsTest, Slice)
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));

cudf::size_type start = 3;
cudf::size_type end = GetParam();
std::vector<const char*> h_expected;
if (end > start) {
for (cudf::size_type idx = start; (idx < end) && (idx < (cudf::size_type)h_strings.size());
++idx)
h_expected.push_back(h_strings[idx]);
}
auto strings_view = cudf::strings_column_view(strings);
auto results = cudf::strings::detail::copy_slice(strings_view, start, end);
auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end);

cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end());
// thrust::make_transform_iterator( h_expected.begin(), [] (auto str) { return str!=nullptr; }));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
cudf::test::strings_column_wrapper expected(
h_strings.begin() + start,
h_strings.begin() + end,
thrust::make_transform_iterator(h_strings.begin() + start,
[](auto str) { return str != nullptr; }));
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}

TEST_P(SliceParmsTest, SliceAllNulls)
Expand All @@ -94,42 +91,29 @@ TEST_P(SliceParmsTest, SliceAllNulls)
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));

cudf::size_type start = 3;
cudf::size_type end = GetParam();
std::vector<const char*> h_expected;
if (end > start) {
for (cudf::size_type idx = start; (idx < end) && (idx < (cudf::size_type)h_strings.size());
++idx)
h_expected.push_back(h_strings[idx]);
}
auto strings_view = cudf::strings_column_view(strings);
auto results = cudf::strings::detail::copy_slice(strings_view, start, end);
auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end);

cudf::test::strings_column_wrapper expected(
h_expected.begin(),
h_expected.end(),
thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
h_strings.begin() + start,
h_strings.begin() + end,
thrust::make_transform_iterator(h_strings.begin() + start,
[](auto str) { return str != nullptr; }));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_P(SliceParmsTest, SliceAllEmpty)
{
std::vector<const char*> h_strings{"", "", "", "", "", "", ""};
cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());

cudf::size_type start = 3;
cudf::size_type end = GetParam();
std::vector<const char*> h_expected;
if (end > start) {
for (cudf::size_type idx = start; (idx < end) && (idx < (cudf::size_type)h_strings.size());
++idx)
h_expected.push_back(h_strings[idx]);
}
auto strings_view = cudf::strings_column_view(strings);
auto results = cudf::strings::detail::copy_slice(strings_view, start, end);
cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end());
// thrust::make_transform_iterator( h_expected.begin(), [] (auto str) { return str!=nullptr; }));
auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end);

cudf::test::strings_column_wrapper expected(h_strings.begin() + start, h_strings.begin() + end);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

Expand Down