diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 46f2c0e7bc9..e3d667f0292 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,21 +62,19 @@ std::unique_ptr generate_ngrams( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Generates ngrams of characters within each string. + * @brief Generates ngrams of characters within each string * - * Each character of a string used to build ngrams. + * Each character of a string is used to build ngrams for the output row. * Ngrams are not created across strings. * * ``` - * ["ab", "cde", "fgh"] would generate bigrams as ["ab", "cd", "de", "fg", "gh"] + * ["ab", "cde", "fgh"] would generate bigrams as + * [["ab"], ["cd", "de"], ["fg", "gh"]] * ``` * - * The size of the output column will be the total number of ngrams generated from - * the input strings column. - * - * All null row entries are ignored and the output contains all valid rows. + * All null row entries are ignored and the corresponding output row will be empty. * - * @throw cudf::logic_error if `ngrams < 2` + * @throw std::invalid_argument if `ngrams < 2` * @throw cudf::logic_error if there are not enough characters to generate any ngrams * * @param input Strings column to produce ngrams from @@ -84,7 +82,7 @@ std::unique_ptr generate_ngrams( * Default is 2 = bigram. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of tokens + * @return Lists column of strings */ std::unique_ptr generate_character_ngrams( cudf::strings_column_view const& input, diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 3290b58101d..d2a0ef71e4a 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -40,6 +40,8 @@ #include #include +#include + namespace nvtext { namespace detail { namespace { @@ -90,9 +92,12 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be valid"); + CUDF_EXPECTS( + separator.is_valid(stream), "Parameter separator must be valid", std::invalid_argument); cudf::string_view const d_separator(separator.data(), separator.size()); - CUDF_EXPECTS(ngrams > 1, "Parameter ngrams should be an integer value of 2 or greater"); + CUDF_EXPECTS(ngrams > 1, + "Parameter ngrams should be an integer value of 2 or greater", + std::invalid_argument); auto strings_count = strings.size(); if (strings_count == 0) // if no strings, return an empty column @@ -196,47 +201,45 @@ struct character_ngram_generator_fn { }; } // namespace -std::unique_ptr generate_character_ngrams(cudf::strings_column_view const& strings, +std::unique_ptr generate_character_ngrams(cudf::strings_column_view const& input, cudf::size_type ngrams, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(ngrams > 1, "Parameter ngrams should be an integer value of 2 or greater"); + CUDF_EXPECTS(ngrams >= 2, + "Parameter ngrams should be an integer value of 2 or greater", + std::invalid_argument); - auto const strings_count = strings.size(); - if (strings_count == 0) // if no strings, return an empty column + auto const strings_count = input.size(); + if (strings_count == 0) { // if no strings, return an empty column return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + } - auto const strings_column = cudf::column_device_view::create(strings.parent(), stream); - auto const d_strings = *strings_column; + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - // create a vector of ngram offsets for each string - rmm::device_uvector ngram_offsets(strings_count + 1, stream); - thrust::transform_exclusive_scan( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count + 1), - ngram_offsets.begin(), + auto sizes_itr = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type( - [d_strings, strings_count, ngrams] __device__(auto idx) { - if (d_strings.is_null(idx) || (idx == strings_count)) return 0; + [d_strings = *d_strings, ngrams] __device__(auto idx) { + if (d_strings.is_null(idx)) { return 0; } auto const length = d_strings.element(idx).length(); return std::max(0, static_cast(length + 1 - ngrams)); - }), - cudf::size_type{0}, - thrust::plus()); - - // total ngrams count is the last entry - cudf::size_type const total_ngrams = ngram_offsets.back_element(stream); + })); + auto [offsets, total_ngrams] = + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + auto d_offsets = offsets->view().data(); CUDF_EXPECTS(total_ngrams > 0, "Insufficient number of characters in each string to generate ngrams"); - character_ngram_generator_fn generator{d_strings, ngrams, ngram_offsets.data()}; + character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets}; auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( generator, strings_count, total_ngrams, stream, mr); - return cudf::make_strings_column( + auto output = cudf::make_strings_column( total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); + + return make_lists_column( + input.size(), std::move(offsets), std::move(output), 0, rmm::device_buffer{}, stream, mr); } namespace { @@ -277,7 +280,9 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(ngrams >= 2, "Parameter ngrams should be an integer value of 2 or greater"); + CUDF_EXPECTS(ngrams >= 2, + "Parameter ngrams should be an integer value of 2 or greater", + std::invalid_argument); auto output_type = cudf::data_type{cudf::type_to_id()}; if (input.is_empty()) { return cudf::make_empty_column(output_type); } diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index c5a5a342471..1acb4fc4265 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -50,29 +51,24 @@ TEST_F(TextGenerateNgramsTest, Ngrams) auto const results = nvtext::generate_ngrams(strings_view, 3, separator); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } + using LCW = cudf::test::lists_column_wrapper; { - cudf::test::strings_column_wrapper expected{"th", - "he", - "fo", - "ox", - "ju", - "um", - "mp", - "pe", - "ed", - "ov", - "ve", - "er", - "th", - "hé", - "do", - "og"}; + LCW expected({LCW({"th", "he"}), + LCW({"fo", "ox"}), + LCW({"ju", "um", "mp", "pe", "ed"}), + LCW({"ov", "ve", "er"}), + LCW({"th", "hé"}), + LCW({"do", "og"})}); auto const results = nvtext::generate_character_ngrams(strings_view, 2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - cudf::test::strings_column_wrapper expected{ - "the", "fox", "jum", "ump", "mpe", "ped", "ove", "ver", "thé", "dog"}; + LCW expected({LCW({"the"}), + LCW({"fox"}), + LCW({"jum", "ump", "mpe", "ped"}), + LCW({"ove", "ver"}), + LCW({"thé"}), + LCW({"dog"})}); auto const results = nvtext::generate_character_ngrams(strings_view, 3); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -80,24 +76,29 @@ TEST_F(TextGenerateNgramsTest, Ngrams) TEST_F(TextGenerateNgramsTest, NgramsWithNulls) { - std::vector h_strings{"the", "fox", "", "jumped", "over", nullptr, "the", "dog"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = cudf::test::iterators::null_at(5); + cudf::test::strings_column_wrapper input({"the", "fox", "", "jumped", "over", "", "the", "dog"}, + validity); auto const separator = cudf::string_scalar("_"); - cudf::strings_column_view strings_view(strings); + cudf::strings_column_view sv(input); { - auto const results = nvtext::generate_ngrams(strings_view, 3, separator); + auto const results = nvtext::generate_ngrams(sv, 3, separator); cudf::test::strings_column_wrapper expected{ "the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - cudf::test::strings_column_wrapper expected{ - "the", "fox", "jum", "ump", "mpe", "ped", "ove", "ver", "the", "dog"}; - auto const results = nvtext::generate_character_ngrams(strings_view, 3); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW({"the"}), + LCW({"fox"}), + LCW{}, + LCW({"jum", "ump", "mpe", "ped"}), + LCW({"ove", "ver"}), + LCW{}, + LCW({"the"}), + LCW({"dog"})}); + auto const results = nvtext::generate_character_ngrams(sv, 3); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -121,9 +122,12 @@ TEST_F(TextGenerateNgramsTest, Errors) auto const separator = cudf::string_scalar("_"); // invalid parameter value EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator), - cudf::logic_error); + std::invalid_argument); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1), - cudf::logic_error); + std::invalid_argument); + auto const invalid_separator = cudf::string_scalar("", false); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 2, invalid_separator), + std::invalid_argument); // not enough strings to generate ngrams EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator), cudf::logic_error); @@ -165,7 +169,7 @@ TEST_F(TextGenerateNgramsTest, NgramsHashErrors) auto view = cudf::strings_column_view(input); // invalid parameter value - EXPECT_THROW(nvtext::hash_character_ngrams(view, 1), cudf::logic_error); + EXPECT_THROW(nvtext::hash_character_ngrams(view, 1), std::invalid_argument); // strings not long enough to generate ngrams EXPECT_THROW(nvtext::hash_character_ngrams(view), cudf::logic_error); } diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 06d7aa030db..0862995bc46 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4830,27 +4830,14 @@ def character_ngrams( 2 [xyz] dtype: list """ - ngrams = libstrings.generate_character_ngrams(self._column, n) - - # convert the output to a list by just generating the - # offsets for the output list column - sn = (self.len() - (n - 1)).clip(0, None).fillna(0) # type: ignore - sizes = libcudf.concat.concat_columns( - [column.as_column(0, dtype=np.int32, length=1), sn._column] - ) - oc = libcudf.reduce.scan("cumsum", sizes, True) - lc = cudf.core.column.ListColumn( - size=self._column.size, - dtype=cudf.ListDtype(self._column.dtype), - mask=self._column.mask, - offset=0, - null_count=self._column.null_count, - children=(oc, ngrams), + result = self._return_or_inplace( + libstrings.generate_character_ngrams(self._column, n), + retain_index=True, ) - result = self._return_or_inplace(lc, retain_index=True) - if isinstance(result, cudf.Series) and not as_list: - return result.explode() + # before exploding, removes those lists which have 0 length + result = result[result.list.len() > 0] + return result.explode() # type: ignore return result def hash_character_ngrams( diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 2dccd583b23..6ecead862bb 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import random import string @@ -330,9 +330,8 @@ def test_ngrams(n, separator, expected_values): "he", "er", "re", - cudf.NA, ], - [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6], + [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5], False, ), ( @@ -340,15 +339,12 @@ def test_ngrams(n, separator, expected_values): [ "thi", "his", - cudf.NA, - cudf.NA, "boo", "ook", "her", "ere", - cudf.NA, ], - [1, 1, 2, 3, 4, 4, 5, 5, 6], + [1, 1, 4, 4, 5, 5], False, ), (