Skip to content

Commit

Permalink
Remove empty elements from exploded character-ngrams output (#15371)
Browse files Browse the repository at this point in the history
Fixes `character_ngrams` function to not include empty entries when `as_list=False`. That is, the exploded view (non-list result) should not contain empty or NA elements.

This PR changes the `nvtext::generate_character_ngrams()` API to return a lists column instead of a flat strings column. The python code had been converting the return object into lists column and then exploding it if `as_list=False`. Returning as a list column simplifies the logic and prevents the double conversion. There is almost no impact to the nvtext code since the offsets for the output lists column were already being generated.

All tests were updated to expect the new result. Also changed some exception types from `cudf::logic_error` to `std::invalid_argument` as appropriate.

Continues work of abandoned PR #14685

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #15371
  • Loading branch information
davidwendt authored Apr 4, 2024
1 parent 43994fa commit d7b8fc4
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 93 deletions.
18 changes: 8 additions & 10 deletions cpp/include/nvtext/generate_ngrams.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -62,29 +62,27 @@ std::unique_ptr<cudf::column> generate_ngrams(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Generates ngrams of characters within each string.
* @brief Generates ngrams of characters within each string
*
* Each character of a string used to build ngrams.
* Each character of a string is used to build ngrams for the output row.
* Ngrams are not created across strings.
*
* ```
* ["ab", "cde", "fgh"] would generate bigrams as ["ab", "cd", "de", "fg", "gh"]
* ["ab", "cde", "fgh"] would generate bigrams as
* [["ab"], ["cd", "de"], ["fg", "gh"]]
* ```
*
* The size of the output column will be the total number of ngrams generated from
* the input strings column.
*
* All null row entries are ignored and the output contains all valid rows.
* All null row entries are ignored and the corresponding output row will be empty.
*
* @throw cudf::logic_error if `ngrams < 2`
* @throw std::invalid_argument if `ngrams < 2`
* @throw cudf::logic_error if there are not enough characters to generate any ngrams
*
* @param input Strings column to produce ngrams from
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of tokens
* @return Lists column of strings
*/
std::unique_ptr<cudf::column> generate_character_ngrams(
cudf::strings_column_view const& input,
Expand Down
57 changes: 31 additions & 26 deletions cpp/src/text/generate_ngrams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_scan.h>

#include <stdexcept>

namespace nvtext {
namespace detail {
namespace {
Expand Down Expand Up @@ -90,9 +92,12 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be valid");
CUDF_EXPECTS(
separator.is_valid(stream), "Parameter separator must be valid", std::invalid_argument);
cudf::string_view const d_separator(separator.data(), separator.size());
CUDF_EXPECTS(ngrams > 1, "Parameter ngrams should be an integer value of 2 or greater");
CUDF_EXPECTS(ngrams > 1,
"Parameter ngrams should be an integer value of 2 or greater",
std::invalid_argument);

auto strings_count = strings.size();
if (strings_count == 0) // if no strings, return an empty column
Expand Down Expand Up @@ -196,47 +201,45 @@ struct character_ngram_generator_fn {
};
} // namespace

std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& input,
cudf::size_type ngrams,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(ngrams > 1, "Parameter ngrams should be an integer value of 2 or greater");
CUDF_EXPECTS(ngrams >= 2,
"Parameter ngrams should be an integer value of 2 or greater",
std::invalid_argument);

auto const strings_count = strings.size();
if (strings_count == 0) // if no strings, return an empty column
auto const strings_count = input.size();
if (strings_count == 0) { // if no strings, return an empty column
return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
}

auto const strings_column = cudf::column_device_view::create(strings.parent(), stream);
auto const d_strings = *strings_column;
auto const d_strings = cudf::column_device_view::create(input.parent(), stream);

// create a vector of ngram offsets for each string
rmm::device_uvector<cudf::size_type> ngram_offsets(strings_count + 1, stream);
thrust::transform_exclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count + 1),
ngram_offsets.begin(),
auto sizes_itr = cudf::detail::make_counting_transform_iterator(
0,
cuda::proclaim_return_type<cudf::size_type>(
[d_strings, strings_count, ngrams] __device__(auto idx) {
if (d_strings.is_null(idx) || (idx == strings_count)) return 0;
[d_strings = *d_strings, ngrams] __device__(auto idx) {
if (d_strings.is_null(idx)) { return 0; }
auto const length = d_strings.element<cudf::string_view>(idx).length();
return std::max(0, static_cast<cudf::size_type>(length + 1 - ngrams));
}),
cudf::size_type{0},
thrust::plus<cudf::size_type>());

// total ngrams count is the last entry
cudf::size_type const total_ngrams = ngram_offsets.back_element(stream);
}));
auto [offsets, total_ngrams] =
cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
auto d_offsets = offsets->view().data<cudf::size_type>();
CUDF_EXPECTS(total_ngrams > 0,
"Insufficient number of characters in each string to generate ngrams");

character_ngram_generator_fn generator{d_strings, ngrams, ngram_offsets.data()};
character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets};
auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
generator, strings_count, total_ngrams, stream, mr);

return cudf::make_strings_column(
auto output = cudf::make_strings_column(
total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});

return make_lists_column(
input.size(), std::move(offsets), std::move(output), 0, rmm::device_buffer{}, stream, mr);
}

namespace {
Expand Down Expand Up @@ -277,7 +280,9 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(ngrams >= 2, "Parameter ngrams should be an integer value of 2 or greater");
CUDF_EXPECTS(ngrams >= 2,
"Parameter ngrams should be an integer value of 2 or greater",
std::invalid_argument);

auto output_type = cudf::data_type{cudf::type_to_id<cudf::hash_value_type>()};
if (input.is_empty()) { return cudf::make_empty_column(output_type); }
Expand Down
66 changes: 35 additions & 31 deletions cpp/tests/text/ngrams_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/iterator_utilities.hpp>
#include <cudf_test/testing_main.hpp>

#include <cudf/column/column.hpp>
Expand Down Expand Up @@ -50,54 +51,54 @@ TEST_F(TextGenerateNgramsTest, Ngrams)
auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
{
cudf::test::strings_column_wrapper expected{"th",
"he",
"fo",
"ox",
"ju",
"um",
"mp",
"pe",
"ed",
"ov",
"ve",
"er",
"th",
"",
"do",
"og"};
LCW expected({LCW({"th", "he"}),
LCW({"fo", "ox"}),
LCW({"ju", "um", "mp", "pe", "ed"}),
LCW({"ov", "ve", "er"}),
LCW({"th", ""}),
LCW({"do", "og"})});
auto const results = nvtext::generate_character_ngrams(strings_view, 2);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
{
cudf::test::strings_column_wrapper expected{
"the", "fox", "jum", "ump", "mpe", "ped", "ove", "ver", "thé", "dog"};
LCW expected({LCW({"the"}),
LCW({"fox"}),
LCW({"jum", "ump", "mpe", "ped"}),
LCW({"ove", "ver"}),
LCW({"thé"}),
LCW({"dog"})});
auto const results = nvtext::generate_character_ngrams(strings_view, 3);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
}

TEST_F(TextGenerateNgramsTest, NgramsWithNulls)
{
std::vector<char const*> h_strings{"the", "fox", "", "jumped", "over", nullptr, "the", "dog"};
cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
auto validity = cudf::test::iterators::null_at(5);
cudf::test::strings_column_wrapper input({"the", "fox", "", "jumped", "over", "", "the", "dog"},
validity);
auto const separator = cudf::string_scalar("_");

cudf::strings_column_view strings_view(strings);
cudf::strings_column_view sv(input);
{
auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
auto const results = nvtext::generate_ngrams(sv, 3, separator);
cudf::test::strings_column_wrapper expected{
"the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
{
cudf::test::strings_column_wrapper expected{
"the", "fox", "jum", "ump", "mpe", "ped", "ove", "ver", "the", "dog"};
auto const results = nvtext::generate_character_ngrams(strings_view, 3);
using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
LCW expected({LCW({"the"}),
LCW({"fox"}),
LCW{},
LCW({"jum", "ump", "mpe", "ped"}),
LCW({"ove", "ver"}),
LCW{},
LCW({"the"}),
LCW({"dog"})});
auto const results = nvtext::generate_character_ngrams(sv, 3);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
}
Expand All @@ -121,9 +122,12 @@ TEST_F(TextGenerateNgramsTest, Errors)
auto const separator = cudf::string_scalar("_");
// invalid parameter value
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator),
cudf::logic_error);
std::invalid_argument);
EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1),
cudf::logic_error);
std::invalid_argument);
auto const invalid_separator = cudf::string_scalar("", false);
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 2, invalid_separator),
std::invalid_argument);
// not enough strings to generate ngrams
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator),
cudf::logic_error);
Expand Down Expand Up @@ -165,7 +169,7 @@ TEST_F(TextGenerateNgramsTest, NgramsHashErrors)
auto view = cudf::strings_column_view(input);

// invalid parameter value
EXPECT_THROW(nvtext::hash_character_ngrams(view, 1), cudf::logic_error);
EXPECT_THROW(nvtext::hash_character_ngrams(view, 1), std::invalid_argument);
// strings not long enough to generate ngrams
EXPECT_THROW(nvtext::hash_character_ngrams(view), cudf::logic_error);
}
Expand Down
25 changes: 6 additions & 19 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -4830,27 +4830,14 @@ def character_ngrams(
2 [xyz]
dtype: list
"""
ngrams = libstrings.generate_character_ngrams(self._column, n)

# convert the output to a list by just generating the
# offsets for the output list column
sn = (self.len() - (n - 1)).clip(0, None).fillna(0) # type: ignore
sizes = libcudf.concat.concat_columns(
[column.as_column(0, dtype=np.int32, length=1), sn._column]
)
oc = libcudf.reduce.scan("cumsum", sizes, True)
lc = cudf.core.column.ListColumn(
size=self._column.size,
dtype=cudf.ListDtype(self._column.dtype),
mask=self._column.mask,
offset=0,
null_count=self._column.null_count,
children=(oc, ngrams),
result = self._return_or_inplace(
libstrings.generate_character_ngrams(self._column, n),
retain_index=True,
)
result = self._return_or_inplace(lc, retain_index=True)

if isinstance(result, cudf.Series) and not as_list:
return result.explode()
# before exploding, removes those lists which have 0 length
result = result[result.list.len() > 0]
return result.explode() # type: ignore
return result

def hash_character_ngrams(
Expand Down
10 changes: 3 additions & 7 deletions python/cudf/cudf/tests/text/test_text_methods.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

import random
import string
Expand Down Expand Up @@ -330,25 +330,21 @@ def test_ngrams(n, separator, expected_values):
"he",
"er",
"re",
cudf.NA,
],
[1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6],
[1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5],
False,
),
(
3,
[
"thi",
"his",
cudf.NA,
cudf.NA,
"boo",
"ook",
"her",
"ere",
cudf.NA,
],
[1, 1, 2, 3, 4, 4, 5, 5, 6],
[1, 1, 4, 4, 5, 5],
False,
),
(
Expand Down

0 comments on commit d7b8fc4

Please sign in to comment.