diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 565a396d913..90225b8537a 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -280,7 +280,6 @@ ConfigureNVBench(TEXT_NVBENCH text/minhash.cpp) # * strings benchmark ------------------------------------------------------------------- ConfigureBench( STRINGS_BENCH - string/case.cpp string/combine.cpp string/contains.cpp string/convert_datetime.cpp @@ -301,7 +300,9 @@ ConfigureBench( string/url_decode.cu ) -ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp) +ConfigureNVBench( + STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp string/case.cpp +) # ################################################################################################## # * json benchmark ------------------------------------------------------------------- diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index 6152ea741a3..0cdd5fbac32 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -15,36 +15,64 @@ */ #include -#include -#include +#include #include #include #include -class StringCase : public cudf::benchmark {}; +#include -static void BM_case(benchmark::State& state) +void bench_case(nvbench::state& state) { - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}); - cudf::strings_column_view input(column->view()); + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("width")); + auto const encoding = state.get_string("encoding"); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::strings::to_lower(input); + if (static_cast(n_rows) * static_cast(max_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); } - state.SetBytesProcessed(state.iterations() * input.chars_size()); -} + data_profile const profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + + auto col_view = column->view(); + + cudf::column::contents ascii_contents; + if (encoding == "ascii") { + data_profile ascii_profile = data_profile_builder().no_validity().distribution( + cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126); // nice ASCII range + auto input = cudf::strings_column_view(col_view); + auto ascii_column = + create_random_column(cudf::type_id::INT8, row_count{input.chars_size()}, ascii_profile); + auto ascii_data = ascii_column->view(); -#define SORT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringCase, name) \ - (::benchmark::State & st) { BM_case(st); } \ - BENCHMARK_REGISTER_F(StringCase, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 12, 1 << 24}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); + col_view = cudf::column_view(col_view.type(), + col_view.size(), + nullptr, + col_view.null_mask(), + col_view.null_count(), + 0, + {input.offsets(), ascii_data}); + + ascii_contents = ascii_column->release(); + } + auto input = cudf::strings_column_view(col_view); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + state.add_element_count(input.chars_size(), "chars_size"); + state.add_global_memory_reads(input.chars_size()); + state.add_global_memory_writes(input.chars_size()); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::strings::to_lower(input); }); +} -SORT_BENCHMARK_DEFINE(to_lower) +NVBENCH_BENCH(bench_case) + .set_name("strings_case") + .add_int64_axis("width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_string_axis("encoding", {"ascii", "utf8"}); diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 02c4532bb79..0997983c95e 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,10 @@ #include #include #include -#include #include #include +#include +#include #include #include #include @@ -38,31 +39,33 @@ namespace detail { namespace { /** - * @brief Per string logic for case conversion functions. + * @brief Threshold to decide on using string or warp parallel functions. * + * If the average byte length of a string in a column exceeds this value then + * the warp-parallel function is used to compute the output sizes. + * Otherwise, a regular string-parallel function is used. + * + * This value was found using the strings_lengths benchmark results. */ -struct upper_lower_fn { - const column_device_view d_column; - character_flags_table_type case_flag; // flag to check with on each character - const character_flags_table_type* d_flags; - const character_cases_table_type* d_case_table; - const special_case_mapping* d_special_case_mapping; - int32_t* d_offsets{}; - char* d_chars{}; +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; - __device__ special_case_mapping get_special_case_mapping(uint32_t code_point) - { - return d_special_case_mapping[get_special_case_hash_index(code_point)]; - } +/** + * @brief Utility functions for converting characters to upper or lower case + */ +struct convert_char_fn { + character_flags_table_type case_flag; + character_flags_table_type const* d_flags; + character_cases_table_type const* d_case_table; + special_case_mapping const* d_special_case_mapping; - // compute-size / copy the bytes representing the special case mapping for this codepoint - __device__ int32_t handle_special_case_bytes(uint32_t code_point, - char* d_buffer, - detail::character_flags_table_type flag) + // compute size or copy the bytes representing the special case mapping for this codepoint + __device__ size_type handle_special_case_bytes(uint32_t code_point, + detail::character_flags_table_type flag, + char* d_buffer = nullptr) const { - special_case_mapping m = get_special_case_mapping(code_point); - size_type bytes = 0; + special_case_mapping m = d_special_case_mapping[get_special_case_hash_index(code_point)]; + size_type bytes = 0; auto const count = IS_LOWER(flag) ? m.num_upper_chars : m.num_lower_chars; auto const* chars = IS_LOWER(flag) ? m.upper : m.lower; for (uint16_t idx = 0; idx < count; idx++) { @@ -73,76 +76,200 @@ struct upper_lower_fn { return bytes; } - __device__ void operator()(size_type idx) + // this is called for converting any UTF-8 characters + __device__ size_type process_character(char_utf8 chr, char* d_buffer = nullptr) const + { + auto const code_point = detail::utf8_to_codepoint(chr); + + detail::character_flags_table_type flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0; + + // we apply special mapping in two cases: + // - uncased characters with the special mapping flag: always + // - cased characters with the special mapping flag: when matching the input case_flag + if (IS_SPECIAL(flag) && ((flag & case_flag) || !IS_UPPER_OR_LOWER(flag))) { + return handle_special_case_bytes(code_point, case_flag, d_buffer); + } + + char_utf8 const new_char = + (flag & case_flag) ? detail::codepoint_to_utf8(d_case_table[code_point]) : chr; + return (d_buffer) ? detail::from_char_utf8(new_char, d_buffer) + : detail::bytes_in_char_utf8(new_char); + } + + // special function for converting ASCII-only characters + __device__ char process_ascii(char chr) { - if (d_column.is_null(idx)) { + return (case_flag & d_flags[chr]) ? static_cast(d_case_table[chr]) : chr; + } +}; + +/** + * @brief Per string logic for case conversion functions + * + * This can be used in calls to make_strings_children. + */ +struct upper_lower_fn { + convert_char_fn converter; + column_device_view d_strings; + size_type* d_offsets{}; + char* d_chars{}; + + __device__ void operator()(size_type idx) const + { + if (d_strings.is_null(idx)) { if (!d_chars) d_offsets[idx] = 0; return; } - auto const d_str = d_column.template element(idx); - int32_t bytes = 0; + auto const d_str = d_strings.element(idx); + size_type bytes = 0; char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) { - uint32_t code_point = detail::utf8_to_codepoint(*itr); - - detail::character_flags_table_type flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0; - - // we apply special mapping in two cases: - // - uncased characters with the special mapping flag, always - // - cased characters with the special mapping flag, when matching the input case_flag - // - if (IS_SPECIAL(flag) && ((flag & case_flag) || !IS_UPPER_OR_LOWER(flag))) { - auto const new_bytes = handle_special_case_bytes(code_point, d_buffer, case_flag); - bytes += new_bytes; - if (d_buffer) d_buffer += new_bytes; + auto const size = converter.process_character(*itr, d_buffer); + if (d_buffer) { + d_buffer += size; } else { - char_utf8 new_char = - (flag & case_flag) ? detail::codepoint_to_utf8(d_case_table[code_point]) : *itr; - if (!d_buffer) - bytes += detail::bytes_in_char_utf8(new_char); - else - d_buffer += detail::from_char_utf8(new_char, d_buffer); + bytes += size; } } - if (!d_buffer) d_offsets[idx] = bytes; + if (!d_buffer) { d_offsets[idx] = bytes; } + } +}; + +/** + * @brief Count output bytes in warp-parallel threads + * + * This executes as one warp per string and just computes the output sizes. + */ +struct count_bytes_fn { + convert_char_fn converter; + column_device_view d_strings; + size_type* d_offsets; + + __device__ void operator()(size_type idx) const + { + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + // initialize the output for the atomicAdd + if (lane_idx == 0) { d_offsets[str_idx] = 0; } + __syncwarp(); + + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + auto const str_ptr = d_str.data(); + + size_type size = 0; + for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) { + auto const chr = str_ptr[i]; + if (is_utf8_continuation_char(chr)) { continue; } + char_utf8 u8 = 0; + to_char_utf8(str_ptr + i, u8); + size += converter.process_character(u8); + } + // this is every so slightly faster than using the cub::warp_reduce + if (size > 0) atomicAdd(d_offsets + str_idx, size); } }; +/** + * @brief Special functor for processing ASCII-only data + */ +struct ascii_converter_fn { + convert_char_fn converter; + __device__ char operator()(char chr) { return converter.process_ascii(chr); } +}; + /** * @brief Utility method for converting upper and lower case characters - * in a strings column. + * in a strings column * - * @param strings Strings to convert. + * @param input Strings to convert * @param case_flag The character type to convert (upper, lower, or both) - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with characters converted. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with characters converted */ -std::unique_ptr convert_case(strings_column_view const& strings, +std::unique_ptr convert_case(strings_column_view const& input, character_flags_table_type case_flag, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(type_id::STRING); - - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - - // build functor with lookup tables used for case conversion - upper_lower_fn functor{d_column, - case_flag, - get_character_flags_table(), - get_character_cases_table(), - get_special_case_mapping_table()}; - - // this utility calls the functor to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(functor, strings.size(), stream, mr); - - return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); + if (input.size() == input.null_count()) { + return std::make_unique(input.parent(), stream, mr); + } + + auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_flags = get_character_flags_table(); + auto const d_cases = get_character_cases_table(); + auto const d_special = get_special_case_mapping_table(); + + convert_char_fn ccfn{case_flag, d_flags, d_cases, d_special}; + upper_lower_fn converter{ccfn, *d_strings}; + + // For smaller strings, use the regular string-parallel algorithm + if ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + auto [offsets, chars] = + cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr); + return make_strings_column(input.size(), + std::move(offsets), + std::move(chars), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); + } + + // Check if the input contains any multi-byte characters. + // This check incurs ~20% performance hit for smaller strings and so we only use it + // after the threshold check above. The check makes very little impact for larger strings + // but results in a large performance gain when the input contains only single-byte characters. + // The count_if is faster than any_of or all_of: https://github.com/NVIDIA/thrust/issues/1016 + bool const multi_byte_chars = + thrust::count_if( + rmm::exec_policy(stream), input.chars_begin(), input.chars_end(), [] __device__(auto chr) { + return is_utf8_continuation_char(chr); + }) > 0; + if (!multi_byte_chars) { + // optimization for ASCII-only case: copy the input column and inplace replace each character + auto result = std::make_unique(input.parent(), stream, mr); + auto d_chars = + result->mutable_view().child(strings_column_view::chars_column_index).data(); + auto const chars_size = strings_column_view(result->view()).chars_size(); + thrust::transform( + rmm::exec_policy(stream), d_chars, d_chars + chars_size, d_chars, ascii_converter_fn{ccfn}); + result->set_null_count(input.null_count()); + return result; + } + + // This will use a warp-parallel algorithm to compute the output sizes for each string + // and then uses the normal string parallel functor to build the output. + auto offsets = make_numeric_column( + data_type{type_to_id()}, input.size() + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + // first pass, compute output sizes + // note: tried to use segmented-reduce approach instead here and it was consistently slower + count_bytes_fn counter{ccfn, *d_strings, d_offsets}; + auto const count_itr = thrust::make_counting_iterator(0); + thrust::for_each_n( + rmm::exec_policy(stream), count_itr, input.size() * cudf::detail::warp_size, counter); + + // convert sizes to offsets + auto const bytes = + cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream); + CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), + "Size of output exceeds column size limit", + std::overflow_error); + + auto chars = create_chars_child_column(static_cast(bytes), stream, mr); + // second pass, write output + converter.d_offsets = d_offsets; + converter.d_chars = chars->mutable_view().data(); + thrust::for_each_n(rmm::exec_policy(stream), count_itr, input.size(), converter); + + return make_strings_column(input.size(), + std::move(offsets), + std::move(chars), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } } // namespace diff --git a/cpp/tests/strings/case_tests.cpp b/cpp/tests/strings/case_tests.cpp index 3852930dafe..31637a6ab9a 100644 --- a/cpp/tests/strings/case_tests.cpp +++ b/cpp/tests/strings/case_tests.cpp @@ -202,6 +202,64 @@ TEST_F(StringsCaseTest, MultiCharLower) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsCaseTest, Ascii) +{ + // triggering the ascii code path requires some long-ish strings + cudf::test::strings_column_wrapper input{ + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"}; + auto view = cudf::strings_column_view(input); + auto expected = cudf::test::strings_column_wrapper{ + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"}; + auto results = cudf::strings::to_lower(view); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + expected = cudf::test::strings_column_wrapper{ + "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", + "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", + "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", + "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=-"}; + results = cudf::strings::to_upper(view); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::to_upper(cudf::strings_column_view(cudf::slice(input, {1, 3}).front())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, cudf::slice(expected, {1, 3}).front()); +} + +TEST_F(StringsCaseTest, LongStrings) +{ + // average string length >= AVG_CHAR_BYTES_THRESHOLD as defined in case.cu + cudf::test::strings_column_wrapper input{ + "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"}; + auto view = cudf::strings_column_view(input); + auto expected = cudf::test::strings_column_wrapper{ + "abcdéfghijklmnopqrstuvwxyzabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdéfghijklmnopqrstuvwxyzabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdéfghijklmnopqrstuvwxyzabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdéfghijklmnopqrstuvwxyzabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"}; + auto results = cudf::strings::to_lower(view); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + expected = cudf::test::strings_column_wrapper{ + "ABCDÉFGHIJKLMNOPQRSTUVWXYZABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", + "ABCDÉFGHIJKLMNOPQRSTUVWXYZABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", + "ABCDÉFGHIJKLMNOPQRSTUVWXYZABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", + "ABCDÉFGHIJKLMNOPQRSTUVWXYZABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=-"}; + results = cudf::strings::to_upper(view); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::to_upper(cudf::strings_column_view(cudf::slice(input, {1, 3}).front())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, cudf::slice(expected, {1, 3}).front()); +} + TEST_F(StringsCaseTest, EmptyStringsColumn) { cudf::column_view zero_size_strings_column(