Skip to content

Commit

Permalink
Improve performance for cudf::strings::count_characters for long stri…
Browse files Browse the repository at this point in the history
…ngs (#12779)

Adds more efficient counting algorithm specifically for columns with long strings--greater than 64 bytes on average.
The internal detail method will be used to help improve performance in other strings functions.

Authors:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: #12779
  • Loading branch information
davidwendt authored Feb 27, 2023
1 parent ac1cac6 commit 2025783
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 11 deletions.
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ ConfigureBench(
string/url_decode.cu
)

ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp)
ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp)

# ##################################################################################################
# * json benchmark -------------------------------------------------------------------
Expand Down
56 changes: 56 additions & 0 deletions cpp/benchmarks/string/lengths.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>

#include <cudf/strings/attributes.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

static void bench_lengths(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const table_profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
auto const table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
cudf::strings_column_view input(table->view().column(0));

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
// gather some throughput statistics as well
auto chars_size = input.chars_size();
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read;
state.add_global_memory_writes<nvbench::int32_t>(num_rows); // output is an integer per row

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::strings::count_characters(input);
});
}

NVBENCH_BENCH(bench_lengths)
.set_name("strings_lengths")
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096});
91 changes: 86 additions & 5 deletions cpp/src/strings/attributes.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,7 +19,9 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/strings/attributes.hpp>
#include <cudf/strings/detail/utf8.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand All @@ -29,6 +31,7 @@
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
Expand All @@ -37,10 +40,24 @@
#include <thrust/transform.h>
#include <thrust/transform_scan.h>

#include <cub/warp/warp_reduce.cuh>

namespace cudf {
namespace strings {
namespace detail {
namespace {

/**
* @brief Threshold to decide on using string or warp parallel functions.
*
* If the average byte length of a string in a column exceeds this value then
* the warp-parallel function is used.
* Otherwise, a regular string-parallel function is used.
*
* This value was found using the strings_lengths benchmark results.
*/
constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64;

/**
* @brief Returns a numeric column containing lengths of each string in
* based on the provided unary function.
Expand Down Expand Up @@ -85,21 +102,85 @@ std::unique_ptr<column> counts_fn(strings_column_view const& strings,
return results;
}

/**
* @brief Count characters using a warp per string
*
* @param d_strings Column with strings to count
* @param d_lengths Results of the counts per string
*/
__global__ void count_characters_parallel_fn(column_device_view const d_strings,
size_type* d_lengths)
{
size_type const idx = static_cast<size_type>(threadIdx.x + blockIdx.x * blockDim.x);
using warp_reduce = cub::WarpReduce<size_type>;
__shared__ typename warp_reduce::TempStorage temp_storage;

if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; }

auto const str_idx = idx / cudf::detail::warp_size;
auto const lane_idx = idx % cudf::detail::warp_size;
if (d_strings.is_null(str_idx)) {
d_lengths[str_idx] = 0;
return;
}
auto const d_str = d_strings.element<string_view>(str_idx);
auto const str_ptr = d_str.data();

auto count = 0;
for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) {
count += static_cast<size_type>(is_begin_utf8_char(str_ptr[i]));
}
auto const char_count = warp_reduce(temp_storage).Sum(count);
if (lane_idx == 0) { d_lengths[str_idx] = char_count; }
}

std::unique_ptr<column> count_characters_parallel(strings_column_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
// create output column
auto results = make_numeric_column(data_type{type_to_id<size_type>()},
input.size(),
cudf::detail::copy_bitmask(input.parent(), stream, mr),
input.null_count(),
stream,
mr);

auto const d_lengths = results->mutable_view().data<size_type>();
auto const d_strings = cudf::column_device_view::create(input.parent(), stream);

// fill in the lengths
constexpr int block_size = 256;
cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
count_characters_parallel_fn<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
*d_strings, d_lengths);

// reset null count after call to mutable_view()
results->set_null_count(input.null_count());

return results;
}

} // namespace

std::unique_ptr<column> count_characters(strings_column_view const& strings,
std::unique_ptr<column> count_characters(strings_column_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto ufn = [] __device__(const string_view& d_str) { return d_str.length(); };
return counts_fn(strings, ufn, stream, mr);
if ((input.size() == input.null_count()) ||
((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD)) {
auto ufn = [] __device__(string_view const& d_str) { return d_str.length(); };
return counts_fn(input, ufn, stream, mr);
}

return count_characters_parallel(input, stream, mr);
}

std::unique_ptr<column> count_bytes(strings_column_view const& strings,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto ufn = [] __device__(const string_view& d_str) { return d_str.size_bytes(); };
auto ufn = [] __device__(string_view const& d_str) { return d_str.size_bytes(); };
return counts_fn(strings, ufn, stream, mr);
}

Expand Down
22 changes: 17 additions & 5 deletions cpp/tests/strings/attrs_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -65,7 +65,7 @@ TEST_F(StringsAttributesTest, ZeroSizeStringsColumn)
TEST_F(StringsAttributesTest, StringsLengths)
{
std::vector<const char*> h_strings{
"eee", "bb", nullptr, "", "aa", "ééé", " something a bit longer "};
"eee", "bb", nullptr, "", "aa", "ééé", "something a bit longer than 32 bytes"};
cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
Expand All @@ -74,17 +74,16 @@ TEST_F(StringsAttributesTest, StringsLengths)

{
auto results = cudf::strings::count_characters(strings_view);
std::vector<int32_t> h_expected{3, 2, 0, 0, 2, 3, 24};
std::vector<int32_t> h_expected{3, 2, 0, 0, 2, 3, 36};
cudf::test::fixed_width_column_wrapper<int32_t> expected(
h_expected.begin(),
h_expected.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
{
auto results = cudf::strings::count_bytes(strings_view);
std::vector<int32_t> h_expected{3, 2, 0, 0, 2, 6, 24};
std::vector<int32_t> h_expected{3, 2, 0, 0, 2, 6, 36};
cudf::test::fixed_width_column_wrapper<int32_t> expected(
h_expected.begin(),
h_expected.end(),
Expand All @@ -93,3 +92,16 @@ TEST_F(StringsAttributesTest, StringsLengths)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
}

TEST_F(StringsAttributesTest, StringsLengthsLong)
{
std::vector<std::string> h_strings(
40000, "something a bit longer than 32 bytes ééé ééé ééé ééé ééé ééé ééé");
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
auto strings_view = cudf::strings_column_view(strings);

auto results = cudf::strings::count_characters(strings_view);
std::vector<int32_t> h_expected(h_strings.size(), 64);
cudf::test::fixed_width_column_wrapper<int32_t> expected(h_expected.begin(), h_expected.end());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

0 comments on commit 2025783

Please sign in to comment.