Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove empty elements from exploded character-ngrams output #15371

Merged
merged 1 commit into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 8 additions & 10 deletions cpp/include/nvtext/generate_ngrams.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -62,29 +62,27 @@ std::unique_ptr<cudf::column> generate_ngrams(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Generates ngrams of characters within each string.
* @brief Generates ngrams of characters within each string
*
* Each character of a string used to build ngrams.
* Each character of a string is used to build ngrams for the output row.
* Ngrams are not created across strings.
*
* ```
* ["ab", "cde", "fgh"] would generate bigrams as ["ab", "cd", "de", "fg", "gh"]
* ["ab", "cde", "fgh"] would generate bigrams as
* [["ab"], ["cd", "de"], ["fg", "gh"]]
* ```
*
* The size of the output column will be the total number of ngrams generated from
* the input strings column.
*
* All null row entries are ignored and the output contains all valid rows.
* All null row entries are ignored and the corresponding output row will be empty.
*
* @throw cudf::logic_error if `ngrams < 2`
* @throw std::invalid_argument if `ngrams < 2`
* @throw cudf::logic_error if there are not enough characters to generate any ngrams
*
* @param input Strings column to produce ngrams from
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of tokens
* @return Lists column of strings
*/
std::unique_ptr<cudf::column> generate_character_ngrams(
cudf::strings_column_view const& input,
Expand Down
57 changes: 31 additions & 26 deletions cpp/src/text/generate_ngrams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_scan.h>

#include <stdexcept>

namespace nvtext {
namespace detail {
namespace {
Expand Down Expand Up @@ -90,9 +92,12 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be valid");
CUDF_EXPECTS(
separator.is_valid(stream), "Parameter separator must be valid", std::invalid_argument);
cudf::string_view const d_separator(separator.data(), separator.size());
CUDF_EXPECTS(ngrams > 1, "Parameter ngrams should be an integer value of 2 or greater");
CUDF_EXPECTS(ngrams > 1,
"Parameter ngrams should be an integer value of 2 or greater",
std::invalid_argument);

auto strings_count = strings.size();
if (strings_count == 0) // if no strings, return an empty column
Expand Down Expand Up @@ -196,47 +201,45 @@ struct character_ngram_generator_fn {
};
} // namespace

std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& input,
cudf::size_type ngrams,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(ngrams > 1, "Parameter ngrams should be an integer value of 2 or greater");
CUDF_EXPECTS(ngrams >= 2,
"Parameter ngrams should be an integer value of 2 or greater",
std::invalid_argument);

auto const strings_count = strings.size();
if (strings_count == 0) // if no strings, return an empty column
auto const strings_count = input.size();
if (strings_count == 0) { // if no strings, return an empty column
return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
}

auto const strings_column = cudf::column_device_view::create(strings.parent(), stream);
auto const d_strings = *strings_column;
auto const d_strings = cudf::column_device_view::create(input.parent(), stream);

// create a vector of ngram offsets for each string
rmm::device_uvector<cudf::size_type> ngram_offsets(strings_count + 1, stream);
thrust::transform_exclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count + 1),
ngram_offsets.begin(),
auto sizes_itr = cudf::detail::make_counting_transform_iterator(
0,
cuda::proclaim_return_type<cudf::size_type>(
[d_strings, strings_count, ngrams] __device__(auto idx) {
if (d_strings.is_null(idx) || (idx == strings_count)) return 0;
[d_strings = *d_strings, ngrams] __device__(auto idx) {
if (d_strings.is_null(idx)) { return 0; }
auto const length = d_strings.element<cudf::string_view>(idx).length();
return std::max(0, static_cast<cudf::size_type>(length + 1 - ngrams));
}),
cudf::size_type{0},
thrust::plus<cudf::size_type>());

// total ngrams count is the last entry
cudf::size_type const total_ngrams = ngram_offsets.back_element(stream);
}));
auto [offsets, total_ngrams] =
cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
auto d_offsets = offsets->view().data<cudf::size_type>();
CUDF_EXPECTS(total_ngrams > 0,
"Insufficient number of characters in each string to generate ngrams");

character_ngram_generator_fn generator{d_strings, ngrams, ngram_offsets.data()};
character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets};
auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
generator, strings_count, total_ngrams, stream, mr);

return cudf::make_strings_column(
auto output = cudf::make_strings_column(
total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});

return make_lists_column(
input.size(), std::move(offsets), std::move(output), 0, rmm::device_buffer{}, stream, mr);
}

namespace {
Expand Down Expand Up @@ -277,7 +280,9 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(ngrams >= 2, "Parameter ngrams should be an integer value of 2 or greater");
CUDF_EXPECTS(ngrams >= 2,
"Parameter ngrams should be an integer value of 2 or greater",
std::invalid_argument);

auto output_type = cudf::data_type{cudf::type_to_id<cudf::hash_value_type>()};
if (input.is_empty()) { return cudf::make_empty_column(output_type); }
Expand Down
66 changes: 35 additions & 31 deletions cpp/tests/text/ngrams_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/iterator_utilities.hpp>
#include <cudf_test/testing_main.hpp>

#include <cudf/column/column.hpp>
Expand Down Expand Up @@ -50,54 +51,54 @@ TEST_F(TextGenerateNgramsTest, Ngrams)
auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
{
cudf::test::strings_column_wrapper expected{"th",
"he",
"fo",
"ox",
"ju",
"um",
"mp",
"pe",
"ed",
"ov",
"ve",
"er",
"th",
"hé",
"do",
"og"};
LCW expected({LCW({"th", "he"}),
LCW({"fo", "ox"}),
LCW({"ju", "um", "mp", "pe", "ed"}),
LCW({"ov", "ve", "er"}),
LCW({"th", "hé"}),
LCW({"do", "og"})});
auto const results = nvtext::generate_character_ngrams(strings_view, 2);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
{
cudf::test::strings_column_wrapper expected{
"the", "fox", "jum", "ump", "mpe", "ped", "ove", "ver", "thé", "dog"};
LCW expected({LCW({"the"}),
LCW({"fox"}),
LCW({"jum", "ump", "mpe", "ped"}),
LCW({"ove", "ver"}),
LCW({"thé"}),
LCW({"dog"})});
auto const results = nvtext::generate_character_ngrams(strings_view, 3);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
}

TEST_F(TextGenerateNgramsTest, NgramsWithNulls)
{
std::vector<char const*> h_strings{"the", "fox", "", "jumped", "over", nullptr, "the", "dog"};
cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
auto validity = cudf::test::iterators::null_at(5);
cudf::test::strings_column_wrapper input({"the", "fox", "", "jumped", "over", "", "the", "dog"},
validity);
auto const separator = cudf::string_scalar("_");

cudf::strings_column_view strings_view(strings);
cudf::strings_column_view sv(input);
{
auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
auto const results = nvtext::generate_ngrams(sv, 3, separator);
cudf::test::strings_column_wrapper expected{
"the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
{
cudf::test::strings_column_wrapper expected{
"the", "fox", "jum", "ump", "mpe", "ped", "ove", "ver", "the", "dog"};
auto const results = nvtext::generate_character_ngrams(strings_view, 3);
using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
LCW expected({LCW({"the"}),
LCW({"fox"}),
LCW{},
LCW({"jum", "ump", "mpe", "ped"}),
LCW({"ove", "ver"}),
LCW{},
LCW({"the"}),
LCW({"dog"})});
auto const results = nvtext::generate_character_ngrams(sv, 3);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
}
Expand All @@ -121,9 +122,12 @@ TEST_F(TextGenerateNgramsTest, Errors)
auto const separator = cudf::string_scalar("_");
// invalid parameter value
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator),
cudf::logic_error);
std::invalid_argument);
EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1),
cudf::logic_error);
std::invalid_argument);
auto const invalid_separator = cudf::string_scalar("", false);
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 2, invalid_separator),
std::invalid_argument);
// not enough strings to generate ngrams
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator),
cudf::logic_error);
Expand Down Expand Up @@ -165,7 +169,7 @@ TEST_F(TextGenerateNgramsTest, NgramsHashErrors)
auto view = cudf::strings_column_view(input);

// invalid parameter value
EXPECT_THROW(nvtext::hash_character_ngrams(view, 1), cudf::logic_error);
EXPECT_THROW(nvtext::hash_character_ngrams(view, 1), std::invalid_argument);
// strings not long enough to generate ngrams
EXPECT_THROW(nvtext::hash_character_ngrams(view), cudf::logic_error);
}
Expand Down
25 changes: 6 additions & 19 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -4832,27 +4832,14 @@ def character_ngrams(
2 [xyz]
dtype: list
"""
ngrams = libstrings.generate_character_ngrams(self._column, n)

# convert the output to a list by just generating the
# offsets for the output list column
sn = (self.len() - (n - 1)).clip(0, None).fillna(0) # type: ignore
sizes = libcudf.concat.concat_columns(
[column.as_column(0, dtype=np.int32, length=1), sn._column]
)
oc = libcudf.reduce.scan("cumsum", sizes, True)
lc = cudf.core.column.ListColumn(
size=self._column.size,
dtype=cudf.ListDtype(self._column.dtype),
mask=self._column.mask,
offset=0,
null_count=self._column.null_count,
children=(oc, ngrams),
result = self._return_or_inplace(
libstrings.generate_character_ngrams(self._column, n),
retain_index=True,
)
result = self._return_or_inplace(lc, retain_index=True)

if isinstance(result, cudf.Series) and not as_list:
return result.explode()
# before exploding, removes those lists which have 0 length
result = result[result.list.len() > 0]
return result.explode() # type: ignore
return result

def hash_character_ngrams(
Expand Down
10 changes: 3 additions & 7 deletions python/cudf/cudf/tests/text/test_text_methods.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

import random
import string
Expand Down Expand Up @@ -330,25 +330,21 @@ def test_ngrams(n, separator, expected_values):
"he",
"er",
"re",
cudf.NA,
],
[1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6],
[1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5],
False,
),
(
3,
[
"thi",
"his",
cudf.NA,
cudf.NA,
"boo",
"ook",
"her",
"ere",
cudf.NA,
],
[1, 1, 2, 3, 4, 4, 5, 5, 6],
[1, 1, 4, 4, 5, 5],
False,
),
(
Expand Down
Loading