From d7b8fc4de4107b6ee95cdeb26e7efecd3adf9325 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:50:23 -0400 Subject: [PATCH] Remove empty elements from exploded character-ngrams output (#15371) Fixes `character_ngrams` function to not include empty entries when `as_list=False`. That is, the exploded view (non-list result) should not contain empty or NA elements. This PR changes the `nvtext::generate_character_ngrams()` API to return a lists column instead of a flat strings column. The python code had been converting the return object into lists column and then exploding it if `as_list=False`. Returning as a list column simplifies the logic and prevents the double conversion. There is almost no impact to the nvtext code since the offsets for the output lists column were already being generated. All tests were updated to expect the new result. Also changed some exception types from `cudf::logic_error` to `std::invalid_argument` as appropriate. Continues work of abandoned PR #14685 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15371 --- cpp/include/nvtext/generate_ngrams.hpp | 18 +++-- cpp/src/text/generate_ngrams.cu | 57 ++++++++-------- cpp/tests/text/ngrams_tests.cpp | 66 ++++++++++--------- python/cudf/cudf/core/column/string.py | 25 ++----- .../cudf/cudf/tests/text/test_text_methods.py | 10 +-- 5 files changed, 83 insertions(+), 93 deletions(-) diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 46f2c0e7bc9..e3d667f0292 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,21 +62,19 @@ std::unique_ptr generate_ngrams( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Generates ngrams of characters within each string. + * @brief Generates ngrams of characters within each string * - * Each character of a string used to build ngrams. + * Each character of a string is used to build ngrams for the output row. * Ngrams are not created across strings. * * ``` - * ["ab", "cde", "fgh"] would generate bigrams as ["ab", "cd", "de", "fg", "gh"] + * ["ab", "cde", "fgh"] would generate bigrams as + * [["ab"], ["cd", "de"], ["fg", "gh"]] * ``` * - * The size of the output column will be the total number of ngrams generated from - * the input strings column. - * - * All null row entries are ignored and the output contains all valid rows. + * All null row entries are ignored and the corresponding output row will be empty. * - * @throw cudf::logic_error if `ngrams < 2` + * @throw std::invalid_argument if `ngrams < 2` * @throw cudf::logic_error if there are not enough characters to generate any ngrams * * @param input Strings column to produce ngrams from @@ -84,7 +82,7 @@ std::unique_ptr generate_ngrams( * Default is 2 = bigram. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of tokens + * @return Lists column of strings */ std::unique_ptr generate_character_ngrams( cudf::strings_column_view const& input, diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 3290b58101d..d2a0ef71e4a 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -40,6 +40,8 @@ #include #include +#include + namespace nvtext { namespace detail { namespace { @@ -90,9 +92,12 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be valid"); + CUDF_EXPECTS( + separator.is_valid(stream), "Parameter separator must be valid", std::invalid_argument); cudf::string_view const d_separator(separator.data(), separator.size()); - CUDF_EXPECTS(ngrams > 1, "Parameter ngrams should be an integer value of 2 or greater"); + CUDF_EXPECTS(ngrams > 1, + "Parameter ngrams should be an integer value of 2 or greater", + std::invalid_argument); auto strings_count = strings.size(); if (strings_count == 0) // if no strings, return an empty column @@ -196,47 +201,45 @@ struct character_ngram_generator_fn { }; } // namespace -std::unique_ptr generate_character_ngrams(cudf::strings_column_view const& strings, +std::unique_ptr generate_character_ngrams(cudf::strings_column_view const& input, cudf::size_type ngrams, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(ngrams > 1, "Parameter ngrams should be an integer value of 2 or greater"); + CUDF_EXPECTS(ngrams >= 2, + "Parameter ngrams should be an integer value of 2 or greater", + std::invalid_argument); - auto const strings_count = strings.size(); - if (strings_count == 0) // if no strings, return an empty column + auto const strings_count = input.size(); + if (strings_count == 0) { // if no strings, return an empty column return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + } - auto const strings_column = cudf::column_device_view::create(strings.parent(), stream); - auto const d_strings = *strings_column; + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - // create a vector of ngram offsets for each string - rmm::device_uvector ngram_offsets(strings_count + 1, stream); - thrust::transform_exclusive_scan( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count + 1), - ngram_offsets.begin(), + auto sizes_itr = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type( - [d_strings, strings_count, ngrams] __device__(auto idx) { - if (d_strings.is_null(idx) || (idx == strings_count)) return 0; + [d_strings = *d_strings, ngrams] __device__(auto idx) { + if (d_strings.is_null(idx)) { return 0; } auto const length = d_strings.element(idx).length(); return std::max(0, static_cast(length + 1 - ngrams)); - }), - cudf::size_type{0}, - thrust::plus()); - - // total ngrams count is the last entry - cudf::size_type const total_ngrams = ngram_offsets.back_element(stream); + })); + auto [offsets, total_ngrams] = + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + auto d_offsets = offsets->view().data(); CUDF_EXPECTS(total_ngrams > 0, "Insufficient number of characters in each string to generate ngrams"); - character_ngram_generator_fn generator{d_strings, ngrams, ngram_offsets.data()}; + character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets}; auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( generator, strings_count, total_ngrams, stream, mr); - return cudf::make_strings_column( + auto output = cudf::make_strings_column( total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); + + return make_lists_column( + input.size(), std::move(offsets), std::move(output), 0, rmm::device_buffer{}, stream, mr); } namespace { @@ -277,7 +280,9 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(ngrams >= 2, "Parameter ngrams should be an integer value of 2 or greater"); + CUDF_EXPECTS(ngrams >= 2, + "Parameter ngrams should be an integer value of 2 or greater", + std::invalid_argument); auto output_type = cudf::data_type{cudf::type_to_id()}; if (input.is_empty()) { return cudf::make_empty_column(output_type); } diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index c5a5a342471..1acb4fc4265 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -50,29 +51,24 @@ TEST_F(TextGenerateNgramsTest, Ngrams) auto const results = nvtext::generate_ngrams(strings_view, 3, separator); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } + using LCW = cudf::test::lists_column_wrapper; { - cudf::test::strings_column_wrapper expected{"th", - "he", - "fo", - "ox", - "ju", - "um", - "mp", - "pe", - "ed", - "ov", - "ve", - "er", - "th", - "hé", - "do", - "og"}; + LCW expected({LCW({"th", "he"}), + LCW({"fo", "ox"}), + LCW({"ju", "um", "mp", "pe", "ed"}), + LCW({"ov", "ve", "er"}), + LCW({"th", "hé"}), + LCW({"do", "og"})}); auto const results = nvtext::generate_character_ngrams(strings_view, 2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - cudf::test::strings_column_wrapper expected{ - "the", "fox", "jum", "ump", "mpe", "ped", "ove", "ver", "thé", "dog"}; + LCW expected({LCW({"the"}), + LCW({"fox"}), + LCW({"jum", "ump", "mpe", "ped"}), + LCW({"ove", "ver"}), + LCW({"thé"}), + LCW({"dog"})}); auto const results = nvtext::generate_character_ngrams(strings_view, 3); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -80,24 +76,29 @@ TEST_F(TextGenerateNgramsTest, Ngrams) TEST_F(TextGenerateNgramsTest, NgramsWithNulls) { - std::vector h_strings{"the", "fox", "", "jumped", "over", nullptr, "the", "dog"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = cudf::test::iterators::null_at(5); + cudf::test::strings_column_wrapper input({"the", "fox", "", "jumped", "over", "", "the", "dog"}, + validity); auto const separator = cudf::string_scalar("_"); - cudf::strings_column_view strings_view(strings); + cudf::strings_column_view sv(input); { - auto const results = nvtext::generate_ngrams(strings_view, 3, separator); + auto const results = nvtext::generate_ngrams(sv, 3, separator); cudf::test::strings_column_wrapper expected{ "the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - cudf::test::strings_column_wrapper expected{ - "the", "fox", "jum", "ump", "mpe", "ped", "ove", "ver", "the", "dog"}; - auto const results = nvtext::generate_character_ngrams(strings_view, 3); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW({"the"}), + LCW({"fox"}), + LCW{}, + LCW({"jum", "ump", "mpe", "ped"}), + LCW({"ove", "ver"}), + LCW{}, + LCW({"the"}), + LCW({"dog"})}); + auto const results = nvtext::generate_character_ngrams(sv, 3); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -121,9 +122,12 @@ TEST_F(TextGenerateNgramsTest, Errors) auto const separator = cudf::string_scalar("_"); // invalid parameter value EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator), - cudf::logic_error); + std::invalid_argument); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1), - cudf::logic_error); + std::invalid_argument); + auto const invalid_separator = cudf::string_scalar("", false); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 2, invalid_separator), + std::invalid_argument); // not enough strings to generate ngrams EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator), cudf::logic_error); @@ -165,7 +169,7 @@ TEST_F(TextGenerateNgramsTest, NgramsHashErrors) auto view = cudf::strings_column_view(input); // invalid parameter value - EXPECT_THROW(nvtext::hash_character_ngrams(view, 1), cudf::logic_error); + EXPECT_THROW(nvtext::hash_character_ngrams(view, 1), std::invalid_argument); // strings not long enough to generate ngrams EXPECT_THROW(nvtext::hash_character_ngrams(view), cudf::logic_error); } diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 06d7aa030db..0862995bc46 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4830,27 +4830,14 @@ def character_ngrams( 2 [xyz] dtype: list """ - ngrams = libstrings.generate_character_ngrams(self._column, n) - - # convert the output to a list by just generating the - # offsets for the output list column - sn = (self.len() - (n - 1)).clip(0, None).fillna(0) # type: ignore - sizes = libcudf.concat.concat_columns( - [column.as_column(0, dtype=np.int32, length=1), sn._column] - ) - oc = libcudf.reduce.scan("cumsum", sizes, True) - lc = cudf.core.column.ListColumn( - size=self._column.size, - dtype=cudf.ListDtype(self._column.dtype), - mask=self._column.mask, - offset=0, - null_count=self._column.null_count, - children=(oc, ngrams), + result = self._return_or_inplace( + libstrings.generate_character_ngrams(self._column, n), + retain_index=True, ) - result = self._return_or_inplace(lc, retain_index=True) - if isinstance(result, cudf.Series) and not as_list: - return result.explode() + # before exploding, removes those lists which have 0 length + result = result[result.list.len() > 0] + return result.explode() # type: ignore return result def hash_character_ngrams( diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 2dccd583b23..6ecead862bb 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import random import string @@ -330,9 +330,8 @@ def test_ngrams(n, separator, expected_values): "he", "er", "re", - cudf.NA, ], - [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6], + [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5], False, ), ( @@ -340,15 +339,12 @@ def test_ngrams(n, separator, expected_values): [ "thi", "his", - cudf.NA, - cudf.NA, "boo", "ook", "her", "ere", - cudf.NA, ], - [1, 1, 2, 3, 4, 4, 5, 5, 6], + [1, 1, 4, 4, 5, 5], False, ), (