From ae12634c834a82d3d8884110c9de07d91877c828 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 10 Jun 2024 09:51:28 -0400 Subject: [PATCH] Fix large strings handling in nvtext::character_tokenize (#15829) Fix logic for `nvtext::character_tokenize` to handle large strings input. The output for > 2GB input strings column will turn characters into rows and so will likely overflow the `size_type` rows as expected. The `thrust::count_if` is replaced with a raw kernel to produce the appropriate count that can be checked against max row size. Also changed the API to not accept null rows since the code does not check for them and can return invalid results for inputs with unsanitized-null rows. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15829 --- cpp/benchmarks/text/tokenize.cpp | 6 +- cpp/include/nvtext/tokenize.hpp | 3 +- cpp/src/text/tokenize.cu | 66 ++++++++++++++----- cpp/tests/text/tokenize_tests.cpp | 10 +-- python/cudf/cudf/core/column/string.py | 13 ++-- .../cudf/cudf/tests/text/test_text_methods.py | 2 - 6 files changed, 66 insertions(+), 34 deletions(-) diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index 2151b28d637..e83310e0343 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -39,8 +39,10 @@ static void bench_tokenize(nvbench::state& state) state.skip("Skip benchmarks greater than size_type limit"); } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .no_validity(); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp index ea1b9c716f0..29fed0759c7 100644 --- a/cpp/include/nvtext/tokenize.hpp +++ b/cpp/include/nvtext/tokenize.hpp @@ -176,7 +176,8 @@ std::unique_ptr count_tokens( * t is now ["h","e","l","l","o"," ","w","o","r","l","d","g","o","o","d","b","y","e"] * @endcode * - * All null row entries are ignored and the output contains all valid rows. + * @throw std::invalid_argument if `input` contains nulls + * @throw std::overflow_error if the output would produce more than max size_type rows * * @param input Strings column to tokenize * @param stream CUDA stream used for device memory operations and kernel launches diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 0b16305a81a..25406bce759 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,7 @@ #include #include +#include #include #include #include @@ -99,6 +101,31 @@ std::unique_ptr tokenize_fn(cudf::size_type strings_count, return cudf::strings::detail::make_strings_column(tokens.begin(), tokens.end(), stream, mr); } +constexpr int64_t block_size = 512; // number of threads per block +constexpr int64_t bytes_per_thread = 4; // bytes processed per thread + +CUDF_KERNEL void count_characters(uint8_t const* d_chars, int64_t chars_bytes, int64_t* d_output) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const byte_idx = static_cast(idx) * bytes_per_thread; + auto const lane_idx = static_cast(threadIdx.x); + + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; + + int64_t count = 0; + // each thread processes multiple bytes + for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) { + count += cudf::strings::detail::is_begin_utf8_char(d_chars[i]); + } + auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum()); + + if ((lane_idx == 0) && (total > 0)) { + cuda::atomic_ref ref{*d_output}; + ref.fetch_add(total, cuda::std::memory_order_relaxed); + } +} + } // namespace // detail APIs @@ -176,11 +203,17 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } - auto offsets = strings_column.offsets(); - auto offset = cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream); - auto chars_bytes = cudf::strings::detail::get_offset_value( - offsets, strings_column.offset() + strings_count, stream) - - offset; + CUDF_EXPECTS( + strings_column.null_count() == 0, "input must not contain nulls", std::invalid_argument); + + auto const offsets = strings_column.offsets(); + auto const offset = + cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream); + auto const chars_bytes = cudf::strings::detail::get_offset_value( + offsets, strings_column.offset() + strings_count, stream) - + offset; + // no bytes -- this could happen in an all-empty column + if (chars_bytes == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } auto d_chars = strings_column.parent().data(); // unsigned is necessary for checking bits d_chars += offset; @@ -188,23 +221,26 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // To minimize memory, count the number of characters so we can // build the output offsets without an intermediate buffer. // In the worst case each byte is a character so the output is 4x the input. - cudf::size_type num_characters = thrust::count_if( - rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, [] __device__(uint8_t byte) { - return cudf::strings::detail::is_begin_utf8_char(byte); - }); + rmm::device_scalar d_count(0, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + cudf::util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), + block_size); + count_characters<<>>( + d_chars, chars_bytes, d_count.data()); + auto const num_characters = d_count.value(stream); - // no characters check -- this could happen in all-empty or all-null strings column - if (num_characters == 0) { - return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - } + // number of characters becomes the number of rows so need to check the row limit + CUDF_EXPECTS( + num_characters + 1 < static_cast(std::numeric_limits::max()), + "output exceeds the column size limit", + std::overflow_error); // create output offsets column - // -- conditionally copy a counting iterator where - // the first byte of each character is located auto offsets_column = cudf::make_numeric_column( offsets.type(), num_characters + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto d_new_offsets = cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view()); + // offsets are at the beginning byte of each character cudf::detail::copy_if_safe( thrust::counting_iterator(0), thrust::counting_iterator(chars_bytes + 1), diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp index 6a6bcda87cc..a59a54169d7 100644 --- a/cpp/tests/text/tokenize_tests.cpp +++ b/cpp/tests/text/tokenize_tests.cpp @@ -111,17 +111,13 @@ TEST_F(TextTokenizeTest, TokenizeErrorTest) TEST_F(TextTokenizeTest, CharacterTokenize) { - std::vector h_strings{"the mousé ate the cheese", nullptr, ""}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + cudf::test::strings_column_wrapper input({"the mousé ate the cheese", ""}); cudf::test::strings_column_wrapper expected{"t", "h", "e", " ", "m", "o", "u", "s", "é", " ", "a", "t", "e", " ", "t", "h", "e", " ", "c", "h", "e", "e", "s", "e"}; - auto results = nvtext::character_tokenize(cudf::strings_column_view(strings)); + auto results = nvtext::character_tokenize(cudf::strings_column_view(input)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -151,8 +147,6 @@ TEST_F(TextTokenizeTest, TokenizeEmptyTest) EXPECT_EQ(results->size(), 0); results = nvtext::character_tokenize(all_empty); EXPECT_EQ(results->size(), 0); - results = nvtext::character_tokenize(all_null); - EXPECT_EQ(results->size(), 0); auto const delimiter = cudf::string_scalar{""}; results = nvtext::tokenize_with_vocabulary(view, all_empty, delimiter); EXPECT_EQ(results->size(), 0); diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d12aa80e9a3..ad7dbe5e52e 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -552,16 +552,17 @@ def join( return self._return_or_inplace(data) def _split_by_character(self): - result_col = libstrings.character_tokenize(self._column) + col = self._column.fillna("") # sanitize nulls + result_col = libstrings.character_tokenize(col) - offset_col = self._column.children[0] + offset_col = col.children[0] return cudf.core.column.ListColumn( - size=len(self._column), - dtype=cudf.ListDtype(self._column.dtype), - mask=self._column.mask, + size=len(col), + dtype=cudf.ListDtype(col.dtype), + mask=col.mask, offset=0, - null_count=self._column.null_count, + null_count=0, children=(offset_col, result_col), ) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 6bd3b99bae1..36f7f3de828 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -426,7 +426,6 @@ def test_character_tokenize_series(): [ "hello world", "sdf", - None, ( "goodbye, one-two:three~four+five_six@sev" "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" @@ -543,7 +542,6 @@ def test_character_tokenize_index(): [ "hello world", "sdf", - None, ( "goodbye, one-two:three~four+five_six@sev" "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ"