diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index dcc70a4b6d9..c3db3370c62 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -300,7 +300,8 @@ ConfigureBench( ) ConfigureNVBench( - STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp string/case.cpp + STRINGS_NVBENCH string/case.cpp string/char_types.cpp string/lengths.cpp string/like.cpp + string/reverse.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp new file mode 100644 index 00000000000..8e9e595fcef --- /dev/null +++ b/cpp/benchmarks/string/char_types.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +static void bench_char_types(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const api_type = state.get_string("api"); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + auto input_types = cudf::strings::string_character_types::SPACE; + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); // all bytes are read; + if (api_type == "all") { + state.add_global_memory_writes(num_rows); // output is a bool8 per row + } else { + state.add_global_memory_writes(chars_size); + } + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + if (api_type == "all") { + auto result = cudf::strings::all_characters_of_type(input, input_types); + } else { + auto result = cudf::strings::filter_characters_of_type(input, input_types); + } + }); +} + +NVBENCH_BENCH(bench_char_types) + .set_name("char_types") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_string_axis("api", {"all", "filter"}); diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index a403061ba0e..b87fb80fcc2 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -31,59 +31,84 @@ #include #include -#include #include namespace cudf { namespace strings { namespace detail { -// -std::unique_ptr all_characters_of_type(strings_column_view const& strings, +namespace { + +/** + * @brief Returns true for each string where all characters match the given types. + * + * Only the characters that match to `verify_types` are checked. + * Returns false if no characters are checked or one character does not match `types`. + * Returns true if at least one character is checked and all checked characters match `types`. + */ +struct char_types_fn { + column_device_view const d_column; + character_flags_table_type const* d_flags; + string_character_types const types; + string_character_types const verify_types; + + __device__ bool operator()(size_type idx) const + { + if (d_column.is_null(idx)) { return false; } + auto const d_str = d_column.element(idx); + auto const end = d_str.data() + d_str.size_bytes(); + + bool type_matched = !d_str.empty(); // require at least one character; + size_type check_count = 0; // count checked characters + for (auto itr = d_str.data(); type_matched && (itr < end); ++itr) { + uint8_t const chr = static_cast(*itr); + if (is_utf8_continuation_char(chr)) { continue; } + auto u8 = static_cast(chr); // holds UTF8 value + // using max(int8) here since max(char)=255 on ARM systems + if (u8 > std::numeric_limits::max()) { to_char_utf8(itr, u8); } + + // lookup flags in table by codepoint + auto const code_point = utf8_to_codepoint(u8); + auto const flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0; + + if ((verify_types & flag) || // should flag be verified; + (flag == 0 && verify_types == ALL_TYPES)) // special edge case + { + type_matched = (types & flag) > 0; + ++check_count; + } + } + + return type_matched && (check_count > 0); + } +}; +} // namespace + +std::unique_ptr all_characters_of_type(strings_column_view const& input, string_character_types types, string_character_types verify_types, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto strings_count = strings.size(); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; + auto d_strings = column_device_view::create(input.parent(), stream); // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + auto results = make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); - auto results_view = results->mutable_view(); - auto d_results = results_view.data(); // get the static character types table auto d_flags = detail::get_character_flags_table(); + // set the output values by checking the character types for each string thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - [d_column, d_flags, types, verify_types, d_results] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - auto d_str = d_column.element(idx); - bool check = !d_str.empty(); // require at least one character - size_type check_count = 0; - for (auto itr = d_str.begin(); check && (itr != d_str.end()); ++itr) { - auto code_point = detail::utf8_to_codepoint(*itr); - // lookup flags in table by code-point - auto flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0; - if ((verify_types & flag) || // should flag be verified - (flag == 0 && verify_types == ALL_TYPES)) // special edge case - { - check = (types & flag) > 0; - ++check_count; - } - } - return check && (check_count > 0); - }); - // - results->set_null_count(strings.null_count()); + thrust::make_counting_iterator(input.size()), + results->mutable_view().data(), + char_types_fn{*d_strings, d_flags, types, verify_types}); + + results->set_null_count(input.null_count()); return results; }