Skip to content

Commit

Permalink
Migrate nvtext generate_ngrams APIs to pylibcudf (#17006)
Browse files Browse the repository at this point in the history
Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #17006
  • Loading branch information
Matt711 authored Oct 8, 2024
1 parent 2d02bdc commit 09ed210
Show file tree
Hide file tree
Showing 9 changed files with 207 additions and 62 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
===============
generate_ngrams
===============

.. automodule:: pylibcudf.nvtext.generate_ngrams
:members:
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ nvtext
:maxdepth: 1

edit_distance
generate_ngrams
77 changes: 18 additions & 59 deletions python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,75 +2,34 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.generate_ngrams cimport (
generate_character_ngrams as cpp_generate_character_ngrams,
generate_ngrams as cpp_generate_ngrams,
hash_character_ngrams as cpp_hash_character_ngrams,
)
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

from pylibcudf import nvtext


# cudf-internal entry point that generates ngrams from the strings column
# `strings`, using `py_separator` as the separator between tokens.
# NOTE(review): the text below looks like a diff rendered without +/- markers
# (and without indentation), so two alternative bodies appear back to back.
# Presumably the first is the removed implementation and the second is the
# added one -- confirm against the repository before relying on either.
@acquire_spill_lock()
def generate_ngrams(Column strings, int ngrams, object py_separator):

# First body: calls libcudf directly. It takes the DeviceScalar from
# py_separator.device_value, views its raw pointer as a string_scalar, runs
# cpp_generate_ngrams with the GIL released, and wraps the owning pointer.
cdef DeviceScalar separator = py_separator.device_value

cdef column_view c_strings = strings.view()
cdef size_type c_ngrams = ngrams
cdef const string_scalar* c_separator = <const string_scalar*>separator\
.get_raw_ptr()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_generate_ngrams(
c_strings,
c_ngrams,
c_separator[0]
)
)

return Column.from_unique_ptr(move(c_result))
# Second body: delegates to pylibcudf. The column is converted with
# to_pylibcudf(mode="read"), the separator is passed as
# py_separator.device_value.c_value, and the pylibcudf result is converted
# back with Column.from_pylibcudf.
result = nvtext.generate_ngrams.generate_ngrams(
strings.to_pylibcudf(mode="read"),
ngrams,
py_separator.device_value.c_value
)
return Column.from_pylibcudf(result)


# cudf-internal entry point that generates character ngrams for the strings
# column `strings`.
# NOTE(review): the text below looks like a diff rendered without +/- markers
# (and without indentation), so two alternative bodies appear back to back.
# Presumably the first is the removed implementation and the second is the
# added one -- confirm against the repository before relying on either.
@acquire_spill_lock()
def generate_character_ngrams(Column strings, int ngrams):
# First body: calls libcudf's cpp_generate_character_ngrams directly on a
# column_view with the GIL released, then wraps the owning pointer.
cdef column_view c_strings = strings.view()
cdef size_type c_ngrams = ngrams
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_generate_character_ngrams(
c_strings,
c_ngrams
)
)

return Column.from_unique_ptr(move(c_result))
# Second body: delegates to pylibcudf, converting the column with
# to_pylibcudf(mode="read") and the result with Column.from_pylibcudf.
result = nvtext.generate_ngrams.generate_character_ngrams(
strings.to_pylibcudf(mode="read"),
ngrams
)
return Column.from_pylibcudf(result)


# cudf-internal entry point that hashes the character ngrams of the strings
# column `strings`.
# NOTE(review): the text below looks like a diff rendered without +/- markers
# (and without indentation), so two alternative bodies appear back to back.
# Presumably the first is the removed implementation and the second is the
# added one -- confirm against the repository before relying on either.
@acquire_spill_lock()
def hash_character_ngrams(Column strings, int ngrams):
# First body: calls libcudf's cpp_hash_character_ngrams directly on a
# column_view with the GIL released, then wraps the owning pointer.
cdef column_view c_strings = strings.view()
cdef size_type c_ngrams = ngrams
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_hash_character_ngrams(
c_strings,
c_ngrams
)
)

return Column.from_unique_ptr(move(c_result))
# Second body: delegates to pylibcudf, converting the column with
# to_pylibcudf(mode="read") and the result with Column.from_pylibcudf.
result = nvtext.generate_ngrams.hash_character_ngrams(
strings.to_pylibcudf(mode="read"),
ngrams
)
return Column.from_pylibcudf(result)
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources edit_distance.pyx)
set(cython_sources edit_distance.pyx generate_ngrams.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport edit_distance
from . cimport edit_distance, generate_ngrams

__all__ = [
"edit_distance",
"generate_ngrams",
]
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import edit_distance
from . import edit_distance, generate_ngrams

__all__ = [
"edit_distance",
"generate_ngrams",
]
12 changes: 12 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar


cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator)

cpdef Column generate_character_ngrams(Column input, size_type ngrams=*)

cpdef Column hash_character_ngrams(Column input, size_type ngrams=*)
111 changes: 111 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.generate_ngrams cimport (
generate_character_ngrams as cpp_generate_character_ngrams,
generate_ngrams as cpp_generate_ngrams,
hash_character_ngrams as cpp_hash_character_ngrams,
)
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar


cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator):
    """
    Generate ngrams from a strings column as a single column of strings.

    For details, see :cpp:func:`generate_ngrams`

    Parameters
    ----------
    input : Column
        Input strings
    ngrams : size_type
        The ngram number to generate
    separator : Scalar
        The string to use for separating ngram tokens

    Returns
    -------
    Column
        New strings column of tokens
    """
    cdef unique_ptr[column] c_output
    cdef column_view c_input = input.view()
    # The libcudf call is handed the dereferenced string_scalar, so view the
    # scalar held by ``separator`` through a string_scalar pointer.
    # NOTE(review): this cast is unchecked -- presumably callers must pass a
    # string Scalar; confirm.
    cdef const string_scalar* c_sep = <const string_scalar*>separator.c_obj.get()

    # No Python objects are touched during the libcudf call, so drop the GIL.
    with nogil:
        c_output = move(cpp_generate_ngrams(c_input, ngrams, c_sep[0]))

    return Column.from_libcudf(move(c_output))


cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2):
    """
    Generate the character ngrams of each string as a lists column.

    For details, see :cpp:func:`generate_character_ngrams`

    Parameters
    ----------
    input : Column
        Input strings
    ngrams : size_type
        The ngram number to generate. Default is 2.

    Returns
    -------
    Column
        Lists column of strings
    """
    cdef unique_ptr[column] c_output
    cdef column_view c_input = input.view()

    # No Python objects are touched during the libcudf call, so drop the GIL.
    with nogil:
        c_output = move(cpp_generate_character_ngrams(c_input, ngrams))

    return Column.from_libcudf(move(c_output))

cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2):
    """
    Hash the character ngrams of each string into a lists column.

    For details, see :cpp:func:`hash_character_ngrams`

    Parameters
    ----------
    input : Column
        Input strings
    ngrams : size_type
        The ngram number to generate. Default is 2.

    Returns
    -------
    Column
        Lists column of hash values
    """
    cdef unique_ptr[column] c_output
    cdef column_view c_input = input.view()

    # No Python objects are touched during the libcudf call, so drop the GIL.
    with nogil:
        c_output = move(cpp_hash_character_ngrams(c_input, ngrams))

    return Column.from_libcudf(move(c_output))
54 changes: 54 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.fixture(scope="module")
def input_col():
    """Return the three sample strings as a pyarrow array (module-scoped)."""
    return pa.array(["ab", "cde", "fgh"])


@pytest.mark.parametrize("ngram", [2, 3])
@pytest.mark.parametrize("sep", ["_", "**", ","])
def test_generate_ngrams(input_col, ngram, sep):
    """Compare generate_ngrams output with hand-built 2- and 3-gram columns."""
    plc_strings = plc.interop.from_arrow(input_col)
    plc_sep = plc.interop.from_arrow(pa.scalar(sep))
    got = plc.nvtext.generate_ngrams.generate_ngrams(
        plc_strings, ngram, plc_sep
    )

    # Three input strings give two bigrams or a single trigram.
    if ngram == 3:
        want = pa.array([f"ab{sep}cde{sep}fgh"])
    else:
        want = pa.array([f"ab{sep}cde", f"cde{sep}fgh"])
    assert_column_eq(got, want)


@pytest.mark.parametrize("ngram", [2, 3])
def test_generate_character_ngrams(input_col, ngram):
    """Compare per-string character ngrams with hand-built list columns."""
    plc_strings = plc.interop.from_arrow(input_col)
    got = plc.nvtext.generate_ngrams.generate_character_ngrams(
        plc_strings, ngram
    )

    # "ab" is shorter than three characters, so its trigram list is empty.
    if ngram == 3:
        want = pa.array([[], ["cde"], ["fgh"]])
    else:
        want = pa.array([["ab"], ["cd", "de"], ["fg", "gh"]])
    assert_column_eq(got, want)


@pytest.mark.parametrize("ngram", [2, 3])
def test_hash_character_ngrams(input_col, ngram):
    """Check the shape and element type of hash_character_ngrams output.

    The hash values themselves are not pinned. The test verifies that there
    is one output row per input string, that each row holds
    ``max(0, len(string) - ngram + 1)`` hashes, and that the result is a
    list of non-nullable uint32 values.
    """
    result = plc.nvtext.generate_ngrams.hash_character_ngrams(
        plc.interop.from_arrow(input_col),
        ngram,
    )
    pa_result = plc.interop.to_arrow(result)

    # zip() stops at the shorter input, so without this check a result with
    # missing rows (even an empty one) would pass the per-row test below.
    assert len(pa_result) == len(input_col)
    assert all(
        len(got) == max(0, len(s.as_py()) - ngram + 1)
        for got, s in zip(pa_result, input_col)
    )
    assert pa_result.type == pa.list_(
        pa.field("element", pa.uint32(), nullable=False)
    )

0 comments on commit 09ed210

Please sign in to comment.