Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix multibyte check for case convert for large strings #15721

Merged
merged 12 commits into from
May 16, 2024
2 changes: 1 addition & 1 deletion cpp/benchmarks/string/case.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,5 +75,5 @@ void bench_case(nvbench::state& state)
NVBENCH_BENCH(bench_case)
.set_name("case")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
.add_string_axis("encoding", {"ascii", "utf8"});
90 changes: 66 additions & 24 deletions cpp/src/strings/case.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <rmm/exec_policy.hpp>
#include <rmm/resource_ref.hpp>

#include <cub/cub.cuh>
#include <cuda/atomic>
#include <cuda/functional>
#include <thrust/for_each.h>
Expand Down Expand Up @@ -237,13 +238,16 @@ CUDF_KERNEL void count_bytes_kernel(convert_char_fn converter,
auto const d_str = d_strings.element<string_view>(str_idx);
auto const str_ptr = d_str.data();

// each thread processes 4 bytes
size_type size = 0;
for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) {
auto const chr = str_ptr[i];
if (is_utf8_continuation_char(chr)) { continue; }
char_utf8 u8 = 0;
to_char_utf8(str_ptr + i, u8);
size += converter.process_character(u8);
for (auto i = lane_idx * 4; i < d_str.size_bytes(); i += cudf::detail::warp_size * 4) {
for (auto j = i; (j < (i + 4)) && (j < d_str.size_bytes()); j++) {
auto const chr = str_ptr[j];
if (is_utf8_continuation_char(chr)) { continue; }
char_utf8 u8 = 0;
to_char_utf8(str_ptr + j, u8);
size += converter.process_character(u8);
}
}
// this is slightly faster than using the cub::warp_reduce
if (size > 0) {
Expand All @@ -260,6 +264,43 @@ struct ascii_converter_fn {
__device__ char operator()(char chr) { return converter.process_ascii(chr); }
};

constexpr int64_t block_size = 512;
constexpr int64_t bytes_per_thread = 8;

/**
* @brief Checks the chars data for any multibyte characters
*
* The output count is not accurate but it is only checked for > 0.
*/
CUDF_KERNEL void has_multibytes_kernel(char const* d_input_chars,
int64_t first_offset,
int64_t last_offset,
int64_t* d_output)
{
auto const idx = cudf::detail::grid_1d::global_thread_id();
// read only every 2nd byte; all bytes in a multibyte char have high bit set
auto const byte_idx = (static_cast<int64_t>(idx) * bytes_per_thread) + first_offset;
auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
if (byte_idx >= last_offset) { return; }

using block_reduce = cub::BlockReduce<int64_t, block_size>;
__shared__ typename block_reduce::TempStorage temp_storage;

// each thread processes 8 bytes (only 4 need to be checked)
int64_t mb_count = 0;
for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < last_offset); i += 2) {
u_char const chr = static_cast<u_char>(d_input_chars[i]);
mb_count += ((chr & 0x80) > 0);
}

auto const mb_total = block_reduce(temp_storage).Reduce(mb_count, cub::Sum());

if ((lane_idx == 0) && (mb_total > 0)) {
cuda::atomic_ref<int64_t, cuda::thread_scope_block> ref{*d_output};
ref.fetch_add(mb_total, cuda::std::memory_order_relaxed);
}
}

/**
* @brief Utility method for converting upper and lower case characters
* in a strings column
Expand Down Expand Up @@ -289,7 +330,8 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
input.offsets(), input.offset(), stream);
auto const last_offset =
cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream);
auto const chars_size = last_offset - first_offset;
auto const chars_size = last_offset - first_offset;
auto const input_chars = input.chars_begin(stream);

convert_char_fn ccfn{case_flag, d_flags, d_cases, d_special};
upper_lower_fn converter{ccfn, *d_strings};
Expand All @@ -306,16 +348,15 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,

// Check if the input contains any multi-byte characters.
// This check incurs ~20% performance hit for smaller strings and so we only use it
// after the threshold check above. The check makes very little impact for larger strings
// after the threshold check above. The check makes very little impact for long strings
// but results in a large performance gain when the input contains only single-byte characters.
// The count_if is faster than any_of or all_of: https://github.com/NVIDIA/thrust/issues/1016
bool const multi_byte_chars =
thrust::count_if(rmm::exec_policy(stream),
input.chars_begin(stream),
input.chars_end(stream),
cuda::proclaim_return_type<bool>(
[] __device__(auto chr) { return is_utf8_continuation_char(chr); })) > 0;
if (!multi_byte_chars) {
rmm::device_scalar<int64_t> mb_count(0, stream);
// cudf::detail::grid_1d is limited to size_type elements
auto const num_blocks = util::div_rounding_up_safe(chars_size / bytes_per_thread, block_size);
// we only need to check every other byte since either will contain high bit
has_multibytes_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
input_chars, first_offset, last_offset, mb_count.data());
if (mb_count.value(stream) == 0) {
// optimization for ASCII-only case: copy the input column and inplace replace each character
auto result = std::make_unique<column>(input.parent(), stream, mr);
auto d_chars = result->mutable_view().head<char>();
Expand All @@ -329,21 +370,21 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
// note: tried to use segmented-reduce approach instead here and it was consistently slower
auto [offsets, bytes] = [&] {
rmm::device_uvector<size_type> sizes(input.size(), stream);
constexpr int block_size = 512;
cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
count_bytes_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
// cudf::detail::grid_1d is limited to size_type threads
auto const num_blocks = util::div_rounding_up_safe(
static_cast<int64_t>(input.size()) * cudf::detail::warp_size, block_size);
count_bytes_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
ccfn, *d_strings, sizes.data());
// convert sizes to offsets
return cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr);
}();

// build sub-offsets
auto const input_chars = input.chars_begin(stream);
auto const sub_count = chars_size / LS_SUB_BLOCK_SIZE;
auto tmp_offsets = rmm::device_uvector<int64_t>(sub_count + input.size() + 1, stream);
auto const sub_count = chars_size / LS_SUB_BLOCK_SIZE;
auto tmp_offsets = rmm::device_uvector<int64_t>(sub_count + input.size() + 1, stream);
{
rmm::device_uvector<size_type> sub_offsets(sub_count, stream);
auto const count_itr = thrust::make_counting_iterator<size_type>(0);
rmm::device_uvector<int64_t> sub_offsets(sub_count, stream);
auto const count_itr = thrust::make_counting_iterator<int64_t>(0);
thrust::transform(rmm::exec_policy_nosync(stream),
count_itr,
count_itr + sub_count,
Expand All @@ -359,6 +400,7 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
sub_offsets.begin(),
sub_offsets.end(),
tmp_offsets.begin());
stream.synchronize(); // protect against destruction of sub_offsets
}

// run case conversion over the new sub-strings
Expand Down
18 changes: 9 additions & 9 deletions cpp/src/strings/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -265,15 +265,15 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
// Use a heuristic to guess when the fused kernel will be faster than memcpy
if (use_fused_kernel_heuristic(has_nulls, total_bytes, columns.size())) {
// Use single kernel launch to copy chars columns
constexpr size_type block_size{256};
cudf::detail::grid_1d config(total_bytes, block_size);
auto const kernel = fused_concatenate_string_chars_kernel;
kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
d_views,
d_partition_offsets.data(),
static_cast<size_type>(columns.size()),
total_bytes,
d_new_chars);
constexpr size_t block_size{256};
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
// cudf::detail::grid_1d limited to size_type elements
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we fix this? to use size_t ?

Copy link
Contributor Author

@davidwendt davidwendt May 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps worth a discussion. I'm not sure how common this will be and how much impact it will have.
Certainly not something I'd wish to solve in this PR.

auto const num_blocks = util::div_rounding_up_safe(total_bytes, block_size);
auto const kernel = fused_concatenate_string_chars_kernel;
kernel<<<num_blocks, block_size, 0, stream.value()>>>(d_views,
d_partition_offsets.data(),
static_cast<size_type>(columns.size()),
total_bytes,
d_new_chars);
} else {
// Memcpy each input chars column (more efficient for very large strings)
for (auto column = columns.begin(); column != columns.end(); ++column) {
Expand Down
3 changes: 2 additions & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -572,9 +572,10 @@ ConfigureTest(
# * large strings test ----------------------------------------------------------------------------
ConfigureTest(
LARGE_STRINGS_TEST
large_strings/concatenate_tests.cpp
large_strings/case_tests.cpp
large_strings/large_strings_fixture.cpp
large_strings/merge_tests.cpp
large_strings/concatenate_tests.cpp
large_strings/parquet_tests.cpp
large_strings/reshape_tests.cpp
GPUS 1
Expand Down
52 changes: 52 additions & 0 deletions cpp/tests/large_strings/case_tests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "large_strings_fixture.hpp"

#include <cudf_test/column_utilities.hpp>

#include <cudf/concatenate.hpp>
#include <cudf/copying.hpp>
#include <cudf/strings/case.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <vector>

struct CaseTest : public cudf::test::StringsLargeTest {};

TEST_F(CaseTest, ToLower)
{
auto const wide = this->wide_column();
auto input = cudf::concatenate(std::vector<cudf::column_view>(120000, wide)); // 230MB
auto expected = cudf::strings::to_lower(cudf::strings_column_view(input->view()));

int const multiplier = 12;
std::vector<cudf::column_view> input_cols(multiplier, input->view());
std::vector<cudf::size_type> splits;
std::generate_n(std::back_inserter(splits), multiplier - 1, [&input, n = 1]() mutable {
return input->view().size() * (n++);
});

auto large_input = cudf::concatenate(input_cols); // 2700MB > 2GB
auto const sv = cudf::strings_column_view(large_input->view());
auto result = cudf::strings::to_lower(sv);

// verify results in sections
auto sliced = cudf::split(result->view(), splits);
for (auto c : sliced) {
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, expected->view());
}
}
13 changes: 13 additions & 0 deletions cpp/tests/large_strings/concatenate_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,16 @@ TEST_F(ConcatenateTest, ConcatenateVertical)
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
}
}

TEST_F(ConcatenateTest, ManyColumns)
{
auto input = this->wide_column();
auto view = cudf::column_view(input);
int const multiplier = 1200000;
std::vector<cudf::column_view> input_cols(multiplier, view); // 2500MB > 2GB
// this tests a unique path through the code
auto result = cudf::concatenate(input_cols);
auto sv = cudf::strings_column_view(result->view());
EXPECT_EQ(sv.size(), view.size() * multiplier);
EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
}
Loading