From 7be60e11b5d4890cc104042ce0fa442c4c0bc909 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 1 May 2023 14:52:21 -0400 Subject: [PATCH 1/6] Performance improvement in cudf::strings::all_characters_of_type --- cpp/benchmarks/CMakeLists.txt | 4 +- cpp/benchmarks/string/char_types.cpp | 67 +++++++++++++++++++ cpp/src/strings/char_types/char_types.cu | 84 ++++++++++++++---------- 3 files changed, 121 insertions(+), 34 deletions(-) create mode 100644 cpp/benchmarks/string/char_types.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 565a396d913..0d1036e6a40 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -301,7 +301,9 @@ ConfigureBench( string/url_decode.cu ) -ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp) +ConfigureNVBench( + STRINGS_NVBENCH string/char_types.cpp string/like.cpp string/reverse.cpp string/lengths.cpp +) # ################################################################################################## # * json benchmark ------------------------------------------------------------------- diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp new file mode 100644 index 00000000000..2ce0e05eab8 --- /dev/null +++ b/cpp/benchmarks/string/char_types.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include + +static void bench_char_types(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const api_type = state.get_string("api"); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + auto input_types = cudf::strings::string_character_types::SPACE; + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); // all bytes are read; + if (api_type == "all") { + state.add_global_memory_writes(num_rows); // output is a bool8 per row + } else { + state.add_global_memory_writes(chars_size); + } + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + if (api_type == "all") { + auto result = cudf::strings::all_characters_of_type(input, input_types); + } else { + auto result = cudf::strings::filter_characters_of_type(input, input_types); + } + }); +} + +NVBENCH_BENCH(bench_char_types) + .set_name("char_types") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_string_axis("api", {"all", "filter"}); diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index a403061ba0e..a0e3f48f22b 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -37,53 +37,71 @@ namespace cudf { namespace strings { namespace detail { -// -std::unique_ptr all_characters_of_type(strings_column_view const& strings, +namespace { + +struct char_types_fn { + column_device_view const d_column; + character_flags_table_type const* d_flags; + string_character_types const types; + string_character_types const verify_types; + + __device__ bool operator()(size_type idx) + { + if (d_column.is_null(idx)) return false; + auto const d_str = d_column.element(idx); + auto const end = d_str.data() + d_str.size_bytes(); + + bool type_matched = !d_str.empty(); // require at least one character; + size_type check_count = 0; // count checked characters + for (auto itr = d_str.data(); type_matched && (itr < end); ++itr) { + uint8_t const chr = static_cast(*itr); + if (is_utf8_continuation_char(chr)) { continue; } + auto u8 = static_cast(chr); // holds UTF8 value + if (u8 > std::numeric_limits::max()) { to_char_utf8(itr, u8); } + + // lookup flags in table by code-point + auto const code_point = utf8_to_codepoint(u8); + auto const flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0; + + if ((verify_types & flag) || // should flag be verified; + (flag == 0 && verify_types == ALL_TYPES)) // special edge case + { + type_matched = (types & flag) > 0; + ++check_count; + } + } + + return type_matched && (check_count > 0); + } +}; +} // namespace + +std::unique_ptr all_characters_of_type(strings_column_view const& input, string_character_types types, string_character_types verify_types, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto strings_count = strings.size(); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; + auto d_strings = column_device_view::create(input.parent(), stream); // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + auto results = make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); - auto results_view = results->mutable_view(); - auto d_results = results_view.data(); // get the static character types table auto d_flags = detail::get_character_flags_table(); + // set the output values by checking the character types for each string thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - [d_column, d_flags, types, verify_types, d_results] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - auto d_str = d_column.element(idx); - bool check = !d_str.empty(); // require at least one character - size_type check_count = 0; - for (auto itr = d_str.begin(); check && (itr != d_str.end()); ++itr) { - auto code_point = detail::utf8_to_codepoint(*itr); - // lookup flags in table by code-point - auto flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0; - if ((verify_types & flag) || // should flag be verified - (flag == 0 && verify_types == ALL_TYPES)) // special edge case - { - check = (types & flag) > 0; - ++check_count; - } - } - return check && (check_count > 0); - }); - // - results->set_null_count(strings.null_count()); + thrust::make_counting_iterator(input.size()), + results->mutable_view().data(), + char_types_fn{*d_strings, d_flags, types, verify_types}); + + results->set_null_count(input.null_count()); return results; } From 433516d1ca0e66c7f2f332cf0d4331ef2c6fb2b6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 2 May 2023 11:40:44 -0400 Subject: [PATCH 2/6] use max(int8) instead of max(char) --- cpp/src/strings/char_types/char_types.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index a0e3f48f22b..e3290503f0a 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -57,7 +57,8 @@ struct char_types_fn { uint8_t const chr = static_cast(*itr); if (is_utf8_continuation_char(chr)) { continue; } auto u8 = static_cast(chr); // holds UTF8 value - if (u8 > std::numeric_limits::max()) { to_char_utf8(itr, u8); } + // using max(int8) here since max(char)=255 on ARM systems + if (u8 > std::numeric_limits::max()) { to_char_utf8(itr, u8); } // lookup flags in table by code-point auto const code_point = utf8_to_codepoint(u8); From f445b9fe4b3806ada7de21e8b2fd75954391d308 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 3 May 2023 10:03:17 -0400 Subject: [PATCH 3/6] fix/add comments --- cpp/src/strings/char_types/char_types.cu | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index e3290503f0a..40188deb7a9 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -31,7 +31,6 @@ #include #include -#include #include namespace cudf { @@ -39,6 +38,13 @@ namespace strings { namespace detail { namespace { +/** + * @brief Returns true for each string where all characters match the given types. + * + * Only the characters that match to `verify_types` are checked. + * Returns false if no characters are checked or one character does not match `types`. + * Returns true if at least one character is checked and all checked characters match `types`. + */ struct char_types_fn { column_device_view const d_column; character_flags_table_type const* d_flags; From 39a89017505631997d05f089cb05238a4b3b6b46 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 8 May 2023 13:39:34 -0400 Subject: [PATCH 4/6] add some const decls --- cpp/src/strings/char_types/char_types.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 40188deb7a9..d469b3d0aa9 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -51,9 +51,9 @@ struct char_types_fn { string_character_types const types; string_character_types const verify_types; - __device__ bool operator()(size_type idx) + __device__ bool operator()(size_type idx) const { - if (d_column.is_null(idx)) return false; + if (d_column.is_null(idx)) { return false; } auto const d_str = d_column.element(idx); auto const end = d_str.data() + d_str.size_bytes(); From 07cc19ccce3f8af2b355878fadb2fcae4756d236 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 10 May 2023 17:10:59 -0400 Subject: [PATCH 5/6] remove unneeded header --- cpp/benchmarks/string/char_types.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp index 2ce0e05eab8..8e9e595fcef 100644 --- a/cpp/benchmarks/string/char_types.cpp +++ b/cpp/benchmarks/string/char_types.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include From 91051cb265b4d3761952ee1500dc926d5f4ea608 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 11 May 2023 09:38:17 -0400 Subject: [PATCH 6/6] fix comment --- cpp/src/strings/char_types/char_types.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index d469b3d0aa9..b87fb80fcc2 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -66,7 +66,7 @@ struct char_types_fn { // using max(int8) here since max(char)=255 on ARM systems if (u8 > std::numeric_limits::max()) { to_char_utf8(itr, u8); } - // lookup flags in table by code-point + // lookup flags in table by codepoint auto const code_point = utf8_to_codepoint(u8); auto const flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0;