From 202578307c81b1f7aad8fbe6f71ca9c7f3fa5c4a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 27 Feb 2023 13:46:45 -0500 Subject: [PATCH] Improve performance for cudf::strings::count_characters for long strings (#12779) Adds more efficient counting algorithm specifically for columns with long strings--greater than 64 bytes on average. The internal detail method will be used to help improve performance in other strings functions. Authors: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12779 --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/benchmarks/string/lengths.cpp | 56 +++++++++++++++++++ cpp/src/strings/attributes.cu | 91 +++++++++++++++++++++++++++++-- cpp/tests/strings/attrs_tests.cpp | 22 ++++++-- 4 files changed, 160 insertions(+), 11 deletions(-) create mode 100644 cpp/benchmarks/string/lengths.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index c5ae3345da5..11da30f108a 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -295,7 +295,7 @@ ConfigureBench( string/url_decode.cu ) -ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp) +ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp) # ################################################################################################## # * json benchmark ------------------------------------------------------------------- diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp new file mode 100644 index 00000000000..4540e4a8f42 --- /dev/null +++ b/cpp/benchmarks/string/lengths.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include + +static void bench_lengths(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(num_rows); // output is an integer per row + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::strings::count_characters(input); + }); +} + +NVBENCH_BENCH(bench_lengths) + .set_name("strings_lengths") + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}); diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 127d3aa8fe7..66288c7d14d 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -29,6 +31,7 @@ #include #include +#include #include #include #include @@ -37,10 +40,24 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { namespace { + +/** + * @brief Threshold to decide on using string or warp parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * the warp-parallel function is used. + * Otherwise, a regular string-parallel function is used. + * + * This value was found using the strings_lengths benchmark results. + */ +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; + /** * @brief Returns a numeric column containing lengths of each string in * based on the provided unary function. @@ -85,21 +102,85 @@ std::unique_ptr counts_fn(strings_column_view const& strings, return results; } +/** + * @brief Count characters using a warp per string + * + * @param d_strings Column with strings to count + * @param d_lengths Results of the counts per string + */ +__global__ void count_characters_parallel_fn(column_device_view const d_strings, + size_type* d_lengths) +{ + size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; + + if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } + + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + if (d_strings.is_null(str_idx)) { + d_lengths[str_idx] = 0; + return; + } + auto const d_str = d_strings.element(str_idx); + auto const str_ptr = d_str.data(); + + auto count = 0; + for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) { + count += static_cast(is_begin_utf8_char(str_ptr[i])); + } + auto const char_count = warp_reduce(temp_storage).Sum(count); + if (lane_idx == 0) { d_lengths[str_idx] = char_count; } +} + +std::unique_ptr count_characters_parallel(strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // create output column + auto results = make_numeric_column(data_type{type_to_id()}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + + auto const d_lengths = results->mutable_view().data(); + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + // fill in the lengths + constexpr int block_size = 256; + cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + count_characters_parallel_fn<<>>( + *d_strings, d_lengths); + + // reset null count after call to mutable_view() + results->set_null_count(input.null_count()); + + return results; +} + } // namespace -std::unique_ptr count_characters(strings_column_view const& strings, +std::unique_ptr count_characters(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto ufn = [] __device__(const string_view& d_str) { return d_str.length(); }; - return counts_fn(strings, ufn, stream, mr); + if ((input.size() == input.null_count()) || + ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD)) { + auto ufn = [] __device__(string_view const& d_str) { return d_str.length(); }; + return counts_fn(input, ufn, stream, mr); + } + + return count_characters_parallel(input, stream, mr); } std::unique_ptr count_bytes(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto ufn = [] __device__(const string_view& d_str) { return d_str.size_bytes(); }; + auto ufn = [] __device__(string_view const& d_str) { return d_str.size_bytes(); }; return counts_fn(strings, ufn, stream, mr); } diff --git a/cpp/tests/strings/attrs_tests.cpp b/cpp/tests/strings/attrs_tests.cpp index 9ff2c55ed81..eff992604a6 100644 --- a/cpp/tests/strings/attrs_tests.cpp +++ b/cpp/tests/strings/attrs_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -65,7 +65,7 @@ TEST_F(StringsAttributesTest, ZeroSizeStringsColumn) TEST_F(StringsAttributesTest, StringsLengths) { std::vector h_strings{ - "eee", "bb", nullptr, "", "aa", "ééé", " something a bit longer "}; + "eee", "bb", nullptr, "", "aa", "ééé", "something a bit longer than 32 bytes"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), @@ -74,17 +74,16 @@ TEST_F(StringsAttributesTest, StringsLengths) { auto results = cudf::strings::count_characters(strings_view); - std::vector h_expected{3, 2, 0, 0, 2, 3, 24}; + std::vector h_expected{3, 2, 0, 0, 2, 3, 36}; cudf::test::fixed_width_column_wrapper expected( h_expected.begin(), h_expected.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { auto results = cudf::strings::count_bytes(strings_view); - std::vector h_expected{3, 2, 0, 0, 2, 6, 24}; + std::vector h_expected{3, 2, 0, 0, 2, 6, 36}; cudf::test::fixed_width_column_wrapper expected( h_expected.begin(), h_expected.end(), @@ -93,3 +92,16 @@ TEST_F(StringsAttributesTest, StringsLengths) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } + +TEST_F(StringsAttributesTest, StringsLengthsLong) +{ + std::vector h_strings( + 40000, "something a bit longer than 32 bytes ééé ééé ééé ééé ééé ééé ééé"); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); + auto strings_view = cudf::strings_column_view(strings); + + auto results = cudf::strings::count_characters(strings_view); + std::vector h_expected(h_strings.size(), 64); + cudf::test::fixed_width_column_wrapper expected(h_expected.begin(), h_expected.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +}