diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp
index 2151b28d637..e83310e0343 100644
--- a/cpp/benchmarks/text/tokenize.cpp
+++ b/cpp/benchmarks/text/tokenize.cpp
@@ -39,8 +39,10 @@ static void bench_tokenize(nvbench::state& state)
     state.skip("Skip benchmarks greater than size_type limit");
   }
 
-  data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  data_profile const profile =
+    data_profile_builder()
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .no_validity();
   auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
 
diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp
index ea1b9c716f0..29fed0759c7 100644
--- a/cpp/include/nvtext/tokenize.hpp
+++ b/cpp/include/nvtext/tokenize.hpp
@@ -176,7 +176,8 @@ std::unique_ptr<cudf::column> count_tokens(
  * t is now ["h","e","l","l","o"," ","w","o","r","l","d","g","o","o","d","b","y","e"]
  * @endcode
  *
- * All null row entries are ignored and the output contains all valid rows.
+ * @throw std::invalid_argument if `input` contains nulls
+ * @throw std::overflow_error if the output would produce more than max size_type rows
  *
  * @param input Strings column to tokenize
  * @param stream CUDA stream used for device memory operations and kernel launches
diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu
index 0b16305a81a..25406bce759 100644
--- a/cpp/src/text/tokenize.cu
+++ b/cpp/src/text/tokenize.cu
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -35,6 +36,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -99,6 +101,31 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
   return cudf::strings::detail::make_strings_column(tokens.begin(), tokens.end(), stream, mr);
 }
 
+constexpr int64_t block_size       = 512;  // number of threads per block
+constexpr int64_t bytes_per_thread = 4;    // bytes processed per thread
+
+CUDF_KERNEL void count_characters(uint8_t const* d_chars, int64_t chars_bytes, int64_t* d_output)
+{
+  auto const idx      = cudf::detail::grid_1d::global_thread_id();
+  auto const byte_idx = static_cast<int64_t>(idx) * bytes_per_thread;
+  auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
+
+  using block_reduce = cub::BlockReduce<int64_t, block_size>;
+  __shared__ typename block_reduce::TempStorage temp_storage;
+
+  int64_t count = 0;
+  // each thread processes multiple bytes
+  for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) {
+    count += cudf::strings::detail::is_begin_utf8_char(d_chars[i]);
+  }
+  auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());
+
+  if ((lane_idx == 0) && (total > 0)) {
+    cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*d_output};
+    ref.fetch_add(total, cuda::std::memory_order_relaxed);
+  }
+}
+
 } // namespace
 
 // detail APIs
 
@@ -176,11 +203,17 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
     return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
   }
 
-  auto offsets = strings_column.offsets();
-  auto offset = cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream);
-  auto chars_bytes = cudf::strings::detail::get_offset_value(
-                       offsets, strings_column.offset() + strings_count, stream) -
-                     offset;
+  CUDF_EXPECTS(
+    strings_column.null_count() == 0, "input must not contain nulls", std::invalid_argument);
+
+  auto const offsets = strings_column.offsets();
+  auto const offset =
+    cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream);
+  auto const chars_bytes = cudf::strings::detail::get_offset_value(
+                             offsets, strings_column.offset() + strings_count, stream) -
+                           offset;
+  // no bytes -- this could happen in an all-empty column
+  if (chars_bytes == 0) { return cudf::make_empty_column(cudf::type_id::STRING); }
 
   auto d_chars = strings_column.parent().data<uint8_t>();  // unsigned is necessary for checking bits
   d_chars += offset;
 
@@ -188,23 +221,26 @@
   // To minimize memory, count the number of characters so we can
   // build the output offsets without an intermediate buffer.
   // In the worst case each byte is a character so the output is 4x the input.
-  cudf::size_type num_characters = thrust::count_if(
-    rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, [] __device__(uint8_t byte) {
-      return cudf::strings::detail::is_begin_utf8_char(byte);
-    });
+  rmm::device_scalar<int64_t> d_count(0, stream);
+  auto const num_blocks = cudf::util::div_rounding_up_safe(
+    cudf::util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)),
+    block_size);
+  count_characters<<<num_blocks, block_size, 0, stream.value()>>>(
+    d_chars, chars_bytes, d_count.data());
+  auto const num_characters = d_count.value(stream);
 
-  // no characters check -- this could happen in all-empty or all-null strings column
-  if (num_characters == 0) {
-    return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
-  }
+  // number of characters becomes the number of rows so need to check the row limit
+  CUDF_EXPECTS(
+    num_characters + 1 < static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
+    "output exceeds the column size limit",
+    std::overflow_error);
 
   // create output offsets column
-  // -- conditionally copy a counting iterator where
-  // the first byte of each character is located
   auto offsets_column = cudf::make_numeric_column(
     offsets.type(), num_characters + 1, cudf::mask_state::UNALLOCATED, stream, mr);
   auto d_new_offsets =
     cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());
+  // offsets are at the beginning byte of each character
   cudf::detail::copy_if_safe(
     thrust::counting_iterator<int64_t>(0),
     thrust::counting_iterator<int64_t>(chars_bytes + 1),
diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp
index 6a6bcda87cc..a59a54169d7 100644
--- a/cpp/tests/text/tokenize_tests.cpp
+++ b/cpp/tests/text/tokenize_tests.cpp
@@ -111,17 +111,13 @@ TEST_F(TextTokenizeTest, TokenizeErrorTest)
 
 TEST_F(TextTokenizeTest, CharacterTokenize)
 {
-  std::vector<char const*> h_strings{"the mousé ate the cheese", nullptr, ""};
-  cudf::test::strings_column_wrapper strings(
-    h_strings.begin(),
-    h_strings.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  cudf::test::strings_column_wrapper input({"the mousé ate the cheese", ""});
 
   cudf::test::strings_column_wrapper expected{"t", "h", "e", " ", "m", "o", "u", "s", "é", " ",
                                               "a", "t", "e", " ", "t", "h", "e", " ", "c", "h",
                                               "e", "e", "s", "e"};
 
-  auto results = nvtext::character_tokenize(cudf::strings_column_view(strings));
+  auto results = nvtext::character_tokenize(cudf::strings_column_view(input));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
@@ -151,8 +147,6 @@ TEST_F(TextTokenizeTest, TokenizeEmptyTest)
   EXPECT_EQ(results->size(), 0);
   results = nvtext::character_tokenize(all_empty);
   EXPECT_EQ(results->size(), 0);
-  results = nvtext::character_tokenize(all_null);
-  EXPECT_EQ(results->size(), 0);
   auto const delimiter = cudf::string_scalar{""};
cudf::string_scalar{""}; results = nvtext::tokenize_with_vocabulary(view, all_empty, delimiter); EXPECT_EQ(results->size(), 0); diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d12aa80e9a3..ad7dbe5e52e 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -552,16 +552,17 @@ def join( return self._return_or_inplace(data) def _split_by_character(self): - result_col = libstrings.character_tokenize(self._column) + col = self._column.fillna("") # sanitize nulls + result_col = libstrings.character_tokenize(col) - offset_col = self._column.children[0] + offset_col = col.children[0] return cudf.core.column.ListColumn( - size=len(self._column), - dtype=cudf.ListDtype(self._column.dtype), - mask=self._column.mask, + size=len(col), + dtype=cudf.ListDtype(col.dtype), + mask=col.mask, offset=0, - null_count=self._column.null_count, + null_count=0, children=(offset_col, result_col), ) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 6bd3b99bae1..36f7f3de828 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -426,7 +426,6 @@ def test_character_tokenize_series(): [ "hello world", "sdf", - None, ( "goodbye, one-two:three~four+five_six@sev" "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" @@ -543,7 +542,6 @@ def test_character_tokenize_index(): [ "hello world", "sdf", - None, ( "goodbye, one-two:three~four+five_six@sev" "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ"