Skip to content

Commit

Permalink
Merge branch 'branch-24.02' into streams-io-parquet
Browse files Browse the repository at this point in the history
  • Loading branch information
shrshi committed Jan 5, 2024
2 parents 6d89a40 + 4de4aae commit 47ed293
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 63 deletions.
95 changes: 45 additions & 50 deletions cpp/src/strings/filling/fill.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,91 +15,86 @@
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/valid_if.cuh>
#include <cudf/null_mask.hpp>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/strings/detail/fill.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include <cuda/functional>

namespace cudf {
namespace strings {
namespace detail {
std::unique_ptr<column> fill(strings_column_view const& strings,
namespace {
/**
 * @brief Device functor that yields, for each row, either the fill value or the
 * row's existing string, and then records its size or copies its bytes.
 *
 * Rows with index in [begin, end) resolve to `d_value`. Every other row resolves
 * to the corresponding element of `d_strings`, or to an empty `string_view` when
 * that element is not valid.
 *
 * The call operator has two modes selected by `d_chars`:
 *  - `d_chars` is null: the resolved string's byte size is stored in `d_offsets[idx]`.
 *  - `d_chars` is non-null: the resolved string is copied to `d_chars + d_offsets[idx]`.
 *
 * NOTE(review): `d_offsets` and `d_chars` default to null here and are presumably
 * assigned by the caller that drives this functor (e.g. make_strings_children) --
 * confirm against that utility.
 */
struct fill_fn {
  column_device_view const d_strings;  // source strings column
  size_type const begin;               // first row index to overwrite (inclusive)
  size_type const end;                 // last row index to overwrite (exclusive)
  string_view const d_value;           // string used for rows in [begin, end)
  size_type* d_offsets{};              // receives sizes; read as offsets when copying
  char* d_chars{};                     // output character buffer; null selects sizing mode

  /**
   * @brief Returns the string that row `idx` should contain in the output.
   */
  __device__ string_view resolve_string_at(size_type idx) const
  {
    bool const in_fill_range = (idx >= begin) && (idx < end);
    if (in_fill_range) { return d_value; }
    if (d_strings.is_valid(idx)) { return d_strings.element<string_view>(idx); }
    return string_view{};
  }

  /**
   * @brief Records the size of, or copies the bytes of, the string resolved for row `idx`.
   */
  __device__ void operator()(size_type idx) const
  {
    auto const d_result = resolve_string_at(idx);
    if (d_chars) {
      // Copy mode: d_offsets[idx] is read as the destination position in d_chars.
      copy_string(d_chars + d_offsets[idx], d_result);
      return;
    }
    // Sizing mode: only the byte length is recorded.
    d_offsets[idx] = d_result.size_bytes();
  }
};
} // namespace

std::unique_ptr<column> fill(strings_column_view const& input,
size_type begin,
size_type end,
string_scalar const& value,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto strings_count = strings.size();
if (strings_count == 0) return make_empty_column(type_id::STRING);
auto const strings_count = input.size();
if (strings_count == 0) { return make_empty_column(type_id::STRING); }
CUDF_EXPECTS((begin >= 0) && (end <= strings_count),
"Parameters [begin,end) are outside the range of the provided strings column");
CUDF_EXPECTS(begin <= end, "Parameters [begin,end) have invalid range values");
if (begin == end) // return a copy
return std::make_unique<column>(strings.parent(), stream, mr);

// string_scalar.data() is null for valid, empty strings
auto d_value = get_scalar_device_view(const_cast<string_scalar&>(value));
if (begin == end) { return std::make_unique<column>(input.parent(), stream, mr); }

auto strings_column = column_device_view::create(strings.parent(), stream);
auto d_strings = *strings_column;
auto strings_column = column_device_view::create(input.parent(), stream);
auto const d_strings = *strings_column;
auto const is_valid = value.is_valid(stream);

// create resulting null mask
auto valid_mask = [begin, end, d_value, &value, d_strings, stream, mr] {
if (begin == 0 and end == d_strings.size() and value.is_valid(stream))
auto [null_mask, null_count] = [begin, end, is_valid, d_strings, stream, mr] {
if (begin == 0 and end == d_strings.size() and is_valid) {
return std::pair(rmm::device_buffer{}, 0);
}
return cudf::detail::valid_if(
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(d_strings.size()),
[d_strings, begin, end, d_value] __device__(size_type idx) {
return ((begin <= idx) && (idx < end)) ? d_value.is_valid() : !d_strings.is_null(idx);
[d_strings, begin, end, is_valid] __device__(size_type idx) {
return ((begin <= idx) && (idx < end)) ? is_valid : d_strings.is_valid(idx);
},
stream,
mr);
}();
auto null_count = valid_mask.second;
rmm::device_buffer& null_mask = valid_mask.first;

// build offsets column
auto offsets_transformer = cuda::proclaim_return_type<size_type>(
[d_strings, begin, end, d_value] __device__(size_type idx) {
if (((begin <= idx) && (idx < end)) ? !d_value.is_valid() : d_strings.is_null(idx)) return 0;
return ((begin <= idx) && (idx < end)) ? d_value.size()
: d_strings.element<string_view>(idx).size_bytes();
});
auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<size_type>(0), offsets_transformer);
auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
auto d_offsets = offsets_column->view().data<int32_t>();
auto const d_value = const_cast<string_scalar&>(value);
auto const d_str = is_valid ? d_value.value(stream) : string_view{};
auto fn = fill_fn{d_strings, begin, end, d_str};

// create the chars column
auto chars_column = create_chars_child_column(bytes, stream, mr);
// fill the chars column
auto d_chars = chars_column->mutable_view().data<char>();
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
strings_count,
[d_strings, begin, end, d_value, d_offsets, d_chars] __device__(size_type idx) {
if (((begin <= idx) && (idx < end)) ? !d_value.is_valid() : d_strings.is_null(idx)) return;
string_view const d_str =
((begin <= idx) && (idx < end)) ? d_value.value() : d_strings.element<string_view>(idx);
memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes());
});
auto [offsets_column, chars_column] = make_strings_children(fn, strings_count, stream, mr);

return make_strings_column(strings_count,
std::move(offsets_column),
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/strings/search/find.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -364,7 +364,7 @@ __global__ void contains_warp_parallel_fn(column_device_view const d_strings,
// each thread of the warp will check just part of the string
auto found = false;
for (auto i = static_cast<size_type>(idx % cudf::detail::warp_size);
!found && (i + d_target.size_bytes()) < d_str.size_bytes();
!found && ((i + d_target.size_bytes()) <= d_str.size_bytes());
i += cudf::detail::warp_size) {
// check the target matches this part of the d_str data
if (d_target.compare(d_str.data() + i, d_target.size_bytes()) == 0) { found = true; }
Expand Down
47 changes: 36 additions & 11 deletions cpp/tests/strings/find_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -131,22 +131,38 @@ TEST_F(StringsFindTest, FindLongStrings)
results = cudf::strings::find(view, cudf::strings_column_view(targets));
expected = cudf::test::fixed_width_column_wrapper<cudf::size_type>({7, 56, 0, 0, -1, 73, -1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);

results = cudf::strings::find(view, cudf::string_scalar("ing"));
expected = cudf::test::fixed_width_column_wrapper<cudf::size_type>({-1, 86, 10, 73, -1, 58, -1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);

results = cudf::strings::rfind(view, cudf::string_scalar("ing"));
expected = cudf::test::fixed_width_column_wrapper<cudf::size_type>({-1, 86, 10, 86, -1, 58, -1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}

TEST_F(StringsFindTest, Contains)
{
cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lease", "tést strings", ""},
{1, 1, 0, 1, 1, 1});
cudf::test::strings_column_wrapper strings(
{"Héllo", "thesé", "", "lease", "tést strings", "", "", "éte"}, {1, 1, 0, 1, 1, 1, 1, 1});
auto strings_view = cudf::strings_column_view(strings);
{
cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 1, 0, 0}, {1, 1, 0, 1, 1, 1});
cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 1, 0, 0, 1, 1},
{1, 1, 0, 1, 1, 1, 1, 1});
auto results = cudf::strings::contains(strings_view, cudf::string_scalar("e"));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
{
cudf::test::strings_column_wrapper targets({"Hello", "é", "e", "x", "", ""},
{1, 1, 1, 1, 1, 0});
cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 1, 0}, {1, 1, 0, 1, 1, 1});
cudf::test::fixed_width_column_wrapper<bool> expected({1, 1, 0, 0, 1, 0, 1, 1},
{1, 1, 0, 1, 1, 1, 1, 1});
auto results = cudf::strings::contains(strings_view, cudf::string_scalar("é"));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
{
cudf::test::strings_column_wrapper targets({"Hello", "é", "e", "x", "", "", "n", "t"},
{1, 1, 1, 1, 1, 0, 1, 1});
cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 1, 0, 0, 1},
{1, 1, 0, 1, 1, 1, 1, 1});
auto results = cudf::strings::contains(strings_view, cudf::strings_column_view(targets));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
Expand All @@ -161,15 +177,24 @@ TEST_F(StringsFindTest, ContainsLongStrings)
"it returns the last position where value could be inserted without violating the ordering",
"algorithms execution is parallelized as determined by an execution policy. t",
"he this is a continuation of previous row to make sure string boundaries are honored",
"abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~",
""});
auto strings_view = cudf::strings_column_view(strings);
auto results = cudf::strings::contains(strings_view, cudf::string_scalar("e"));
cudf::test::fixed_width_column_wrapper<bool> expected({1, 1, 1, 1, 1, 1, 0});
auto expected = cudf::test::fixed_width_column_wrapper<bool>({1, 1, 1, 1, 1, 1, 1, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);

results = cudf::strings::contains(strings_view, cudf::string_scalar(" the "));
expected = cudf::test::fixed_width_column_wrapper<bool>({0, 1, 0, 1, 0, 0, 0, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);

results = cudf::strings::contains(strings_view, cudf::string_scalar("a"));
expected = cudf::test::fixed_width_column_wrapper<bool>({1, 1, 1, 1, 1, 1, 1, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);

results = cudf::strings::contains(strings_view, cudf::string_scalar(" the "));
cudf::test::fixed_width_column_wrapper<bool> expected2({0, 1, 0, 1, 0, 0, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected2);
results = cudf::strings::contains(strings_view, cudf::string_scalar("~"));
expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0, 0, 0, 0, 1, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}

TEST_F(StringsFindTest, StartsWith)
Expand Down

0 comments on commit 47ed293

Please sign in to comment.