Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change thrust::count_if call to raw kernel in strings split APIs #15762

Merged
merged 17 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/strings/split/split.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <rmm/resource_ref.hpp>

#include <cuda/functional>
#include <thrust/binary_search.h>
#include <thrust/fill.h>
#include <thrust/for_each.h>
#include <thrust/functional.h>
Expand Down
59 changes: 48 additions & 11 deletions cpp/src/strings/split/split.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,9 @@
#include <rmm/resource_ref.hpp>

#include <cuda/atomic>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/scan.h>
#include <thrust/transform.h>

namespace cudf::strings::detail {
Expand Down Expand Up @@ -297,6 +294,44 @@ std::unique_ptr<column> create_offsets_from_positions(strings_column_view const&
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

/**
* @brief Count the number of delimiters in a strings column
*
* @tparam Tokenizer Functor containing `is_delimiter` function
* @tparam block_size Number of threads per block
* @tparam bytes_per_thread Number of bytes processed per thread
*
* @param tokenizer For checking delimiters
* @param d_offsets Offsets for the strings column
* @param chars_bytes Number of bytes in the strings column
* @param d_output Result of the count
*/
template <typename Tokenizer, int64_t block_size, size_type bytes_per_thread>
CUDF_KERNEL void count_delimiters_kernel(Tokenizer tokenizer,
cudf::detail::input_offsetalator d_offsets,
int64_t chars_bytes,
int64_t* d_output)
{
auto const idx = cudf::detail::grid_1d::global_thread_id();
auto const byte_idx = static_cast<int64_t>(idx) * bytes_per_thread;
auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);

using block_reduce = cub::BlockReduce<int64_t, block_size>;
__shared__ typename block_reduce::TempStorage temp_storage;

int64_t count = 0;
// each thread processes multiple bytes
for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) {
count += tokenizer.is_delimiter(i, d_offsets, chars_bytes);
}
auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());

if ((lane_idx == 0) && (total > 0)) {
cuda::atomic_ref<int64_t, cuda::thread_scope_block> ref{*d_output};
ref.fetch_add(total, cuda::std::memory_order_relaxed);
}
}

/**
* @brief Helper function used by split/rsplit and split_record/rsplit_record
*
Expand Down Expand Up @@ -326,17 +361,19 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());

// count the number of delimiters in the entire column
auto const delimiter_count =
thrust::count_if(rmm::exec_policy(stream),
thrust::counting_iterator<int64_t>(0),
thrust::counting_iterator<int64_t>(chars_bytes),
[tokenizer, d_offsets, chars_bytes] __device__(int64_t idx) {
return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
});
rmm::device_scalar<int64_t> d_count(0, stream);
constexpr int64_t block_size = 512;
constexpr size_type bytes_per_thread = 4;
auto const num_blocks = util::div_rounding_up_safe(
util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)), block_size);
count_delimiters_kernel<Tokenizer, block_size, bytes_per_thread>
<<<num_blocks, block_size, 0, stream.value()>>>(
tokenizer, d_offsets, chars_bytes, d_count.data());

// Create a vector of every delimiter position in the chars column.
// These may include overlapping or otherwise out-of-bounds delimiters which
// will be resolved during token processing.
auto delimiter_positions = rmm::device_uvector<int64_t>(delimiter_count, stream);
auto delimiter_positions = rmm::device_uvector<int64_t>(d_count.value(stream), stream);
auto d_positions = delimiter_positions.data();
cudf::detail::copy_if_safe(
thrust::counting_iterator<int64_t>(0),
Expand Down
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,7 @@ ConfigureTest(
large_strings/merge_tests.cpp
large_strings/parquet_tests.cpp
large_strings/reshape_tests.cpp
large_strings/split_strings_tests.cpp
GPUS 1
PERCENT 100
)
Expand Down
53 changes: 53 additions & 0 deletions cpp/tests/large_strings/split_strings_tests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "large_strings_fixture.hpp"

#include <cudf_test/column_utilities.hpp>

#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/split/split.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table_view.hpp>

#include <vector>

struct StringsSplitTest : public cudf::test::StringsLargeTest {};

TEST_F(StringsSplitTest, Split)
{
auto const expected = this->long_column();
auto const view = cudf::column_view(expected);
auto const multiplier = 10;
auto const separator = cudf::string_scalar("|");
auto const input = cudf::strings::concatenate(
cudf::table_view(std::vector<cudf::column_view>(multiplier, view)), separator);

{
auto result = cudf::strings::split(cudf::strings_column_view(input->view()), separator);
for (auto c : result->view()) {
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, expected);
}
}

auto lc = cudf::strings::split_record(cudf::strings_column_view(input->view()), separator);
auto lv = cudf::lists_column_view(lc->view());
auto sv = cudf::strings_column_view(lv.child());
EXPECT_EQ(sv.size(), view.size() * multiplier);
EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
}
Loading