-
Notifications
You must be signed in to change notification settings - Fork 917
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Migrate nvtext generate_ngrams APIs to pylibcudf (#17006)
Part of #15162. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: #17006
- Loading branch information
Showing
9 changed files
with
207 additions
and
62 deletions.
There are no files selected for viewing
6 changes: 6 additions & 0 deletions
6
docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
=============== | ||
generate_ngrams | ||
=============== | ||
|
||
.. automodule:: pylibcudf.nvtext.generate_ngrams | ||
:members: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,3 +5,4 @@ nvtext | |
:maxdepth: 1 | ||
|
||
edit_distance | ||
generate_ngrams |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,8 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from . cimport edit_distance | ||
from . cimport edit_distance, generate_ngrams | ||
|
||
__all__ = [ | ||
"edit_distance", | ||
"generate_ngrams", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,8 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from . import edit_distance | ||
from . import edit_distance, generate_ngrams | ||
|
||
__all__ = [ | ||
"edit_distance", | ||
"generate_ngrams", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from pylibcudf.column cimport Column | ||
from pylibcudf.libcudf.types cimport size_type | ||
from pylibcudf.scalar cimport Scalar | ||
|
||
|
||
cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator) | ||
|
||
cpdef Column generate_character_ngrams(Column input, size_type ngrams=*) | ||
|
||
cpdef Column hash_character_ngrams(Column input, size_type ngrams=*) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from libcpp.memory cimport unique_ptr | ||
from libcpp.utility cimport move | ||
from pylibcudf.column cimport Column | ||
from pylibcudf.libcudf.column.column cimport column | ||
from pylibcudf.libcudf.column.column_view cimport column_view | ||
from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( | ||
generate_character_ngrams as cpp_generate_character_ngrams, | ||
generate_ngrams as cpp_generate_ngrams, | ||
hash_character_ngrams as cpp_hash_character_ngrams, | ||
) | ||
from pylibcudf.libcudf.scalar.scalar cimport string_scalar | ||
from pylibcudf.libcudf.types cimport size_type | ||
from pylibcudf.scalar cimport Scalar | ||
|
||
|
||
cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator):
    """
    Build a single strings column of ngrams generated from a strings column.

    For details, see :cpp:func:`generate_ngrams`

    Parameters
    ----------
    input : Column
        Input strings
    ngrams : size_type
        The ngram number to generate
    separator : Scalar
        The string to use for separating ngram tokens

    Returns
    -------
    Column
        New strings columns of tokens
    """
    cdef unique_ptr[column] output
    cdef column_view strings_view = input.view()
    # The scalar's underlying libcudf object is reinterpreted as a
    # string_scalar without any runtime type check.
    # NOTE(review): assumes callers always pass a string Scalar - confirm.
    cdef const string_scalar* sep_ptr = <const string_scalar*>(
        separator.c_obj.get()
    )

    # The libcudf call does not touch Python objects, so release the GIL.
    with nogil:
        output = move(
            cpp_generate_ngrams(strings_view, ngrams, sep_ptr[0])
        )

    return Column.from_libcudf(move(output))
|
||
|
||
cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2):
    """
    Build a lists column holding the character ngrams of each string.

    For details, see :cpp:func:`generate_character_ngrams`

    Parameters
    ----------
    input : Column
        Input strings
    ngrams : size_type
        The ngram number to generate

    Returns
    -------
    Column
        Lists column of strings
    """
    cdef unique_ptr[column] output
    cdef column_view strings_view = input.view()

    # The libcudf call does not touch Python objects, so release the GIL.
    with nogil:
        output = move(
            cpp_generate_character_ngrams(strings_view, ngrams)
        )

    return Column.from_libcudf(move(output))
|
||
cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2):
    """
    Build a lists column of hash values of the characters in each string.

    For details, see :cpp:func:`hash_character_ngrams`

    Parameters
    ----------
    input : Column
        Input strings
    ngrams : size_type
        The ngram number to generate

    Returns
    -------
    Column
        Lists column of hash values
    """
    cdef unique_ptr[column] output
    cdef column_view strings_view = input.view()

    # The libcudf call does not touch Python objects, so release the GIL.
    with nogil:
        output = move(
            cpp_hash_character_ngrams(strings_view, ngrams)
        )

    return Column.from_libcudf(move(output))
54 changes: 54 additions & 0 deletions
54
python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
import pyarrow as pa | ||
import pylibcudf as plc | ||
import pytest | ||
from utils import assert_column_eq | ||
|
||
|
||
@pytest.fixture(scope="module")
def input_col():
    """Return the pyarrow string array shared by every test in this module."""
    return pa.array(["ab", "cde", "fgh"])
|
||
|
||
@pytest.mark.parametrize("ngram", [2, 3])
@pytest.mark.parametrize("sep", ["_", "**", ","])
def test_generate_ngrams(input_col, ngram, sep):
    """Check the token ngrams produced for each ngram size and separator."""
    plc_strings = plc.interop.from_arrow(input_col)
    plc_sep = plc.interop.from_arrow(pa.scalar(sep))
    got = plc.nvtext.generate_ngrams.generate_ngrams(
        plc_strings, ngram, plc_sep
    )

    # Three input rows give one trigram, or two bigrams otherwise.
    if ngram == 3:
        want = pa.array([f"ab{sep}cde{sep}fgh"])
    else:
        want = pa.array([f"ab{sep}cde", f"cde{sep}fgh"])

    assert_column_eq(got, want)
|
||
|
||
@pytest.mark.parametrize("ngram", [2, 3])
def test_generate_character_ngrams(input_col, ngram):
    """Check the per-row character ngrams for each ngram size."""
    got = plc.nvtext.generate_ngrams.generate_character_ngrams(
        plc.interop.from_arrow(input_col), ngram
    )

    # "ab" has only two characters, so its trigram list is expected empty.
    rows = (
        [[], ["cde"], ["fgh"]]
        if ngram == 3
        else [["ab"], ["cd", "de"], ["fg", "gh"]]
    )

    assert_column_eq(got, pa.array(rows))
|
||
|
||
@pytest.mark.parametrize("ngram", [2, 3])
def test_hash_character_ngrams(input_col, ngram):
    """Check the per-row hash counts and the element type of the result."""
    plc_hashes = plc.nvtext.generate_ngrams.hash_character_ngrams(
        plc.interop.from_arrow(input_col), ngram
    )
    hashes = plc.interop.to_arrow(plc_hashes)

    # The hash values themselves are not pinned here; only the number of
    # hashes per row is checked: one per sliding window, never negative.
    for row, text in zip(hashes, input_col):
        assert len(row) == max(0, len(text.as_py()) - ngram + 1)

    want_type = pa.list_(pa.field("element", pa.uint32(), nullable=False))
    assert hashes.type == want_type