Remove nvtext::load_vocabulary from pylibcudf (#17220)
This PR follows up #17100 to address the last review comment there (#17100 (review)): it removes the nvtext::load_vocabulary wrapper from pylibcudf in favor of constructing TokenizeVocabulary directly.
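
For callers, the migration is mechanical: construct the TokenizeVocabulary object directly instead of calling the removed load_vocabulary free function. A minimal before/after sketch, mirroring the test change below (the vocabulary and input strings here are illustrative, not taken from the PR):

    import pyarrow as pa
    import pylibcudf as plc

    # Illustrative vocabulary column; any strings column works.
    vocab = plc.interop.from_arrow(pa.array(["the", "quick", "fox"]))

    # Before (removed by this PR):
    #   tv = plc.nvtext.tokenize.load_vocabulary(vocab)
    # After: call the TokenizeVocabulary constructor directly.
    tv = plc.nvtext.tokenize.TokenizeVocabulary(vocab)

    # tokenize_with_vocabulary itself is unchanged.
    result = plc.nvtext.tokenize.tokenize_with_vocabulary(
        plc.interop.from_arrow(pa.array(["the quick fox"])),  # strings to tokenize
        tv,
        plc.interop.from_arrow(pa.scalar(" ")),  # delimiter
        -1,  # default_id for tokens not found in the vocabulary
    )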

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17220
Matt711 authored Oct 31, 2024
1 parent f7020f1 commit 02a50e8
Showing 4 changed files with 12 additions and 42 deletions.
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/tokenize_vocabulary.py
@@ -2,9 +2,10 @@

 from __future__ import annotations

+import pylibcudf as plc
+
 import cudf
 from cudf._lib.nvtext.tokenize import (
-    TokenizeVocabulary as cpp_tokenize_vocabulary,
     tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
 )

@@ -20,7 +21,7 @@ class TokenizeVocabulary:
     """

     def __init__(self, vocabulary: "cudf.Series"):
-        self.vocabulary = cpp_tokenize_vocabulary(
+        self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary(
             vocabulary._column.to_pylibcudf(mode="read")
         )

2 changes: 0 additions & 2 deletions python/pylibcudf/pylibcudf/nvtext/tokenize.pxd
@@ -21,8 +21,6 @@ cpdef Column character_tokenize(Column input)

 cpdef Column detokenize(Column input, Column row_indices, Scalar separator=*)

-cpdef TokenizeVocabulary load_vocabulary(Column input)
-
 cpdef Column tokenize_with_vocabulary(
     Column input,
     TokenizeVocabulary vocabulary,
36 changes: 6 additions & 30 deletions python/pylibcudf/pylibcudf/nvtext/tokenize.pyx
@@ -43,8 +43,7 @@ cpdef Column tokenize_scalar(Column input, Scalar delimiter=None):
     input : Column
         Strings column to tokenize
     delimiter : Scalar
-        String scalar used to separate individual
-        strings into tokens
+        String scalar used to separate individual strings into tokens

     Returns
     -------
@@ -106,7 +105,7 @@ cpdef Column count_tokens_scalar(Column input, Scalar delimiter=None):
     ----------
     input : Column
         Strings column to count tokens
-    delimiters : Scalar]
+    delimiters : Scalar
         String scalar used to separate each string into tokens

     Returns
@@ -141,8 +140,7 @@ cpdef Column count_tokens_column(Column input, Column delimiters):
     input : Column
         Strings column to count tokens
     delimiters : Column
-        Strings column used to separate
-        each string into tokens
+        Strings column used to separate each string into tokens

     Returns
     -------
@@ -198,11 +196,9 @@ cpdef Column detokenize(
     input : Column
         Strings column to detokenize
     row_indices : Column
-        The relative output row index assigned
-        for each token in the input column
+        The relative output row index assigned for each token in the input column
     separator : Scalar
-        String to append after concatenating
-        each token to the proper output row
+        String to append after concatenating each token to the proper output row

     Returns
     -------
@@ … @@

     return Column.from_libcudf(move(c_result))

-cpdef TokenizeVocabulary load_vocabulary(Column input):
-    """
-    Create a ``TokenizeVocabulary`` object from a strings column.
-
-    For details, see cpp:func:`cudf::nvtext::load_vocabulary`
-
-    Parameters
-    ----------
-    input : Column
-        Strings for the vocabulary
-
-    Returns
-    -------
-    TokenizeVocabulary
-        Object to be used with cpp:func:`cudf::nvtext::tokenize_with_vocabulary`
-    """
-    return TokenizeVocabulary(input)
-
-
 cpdef Column tokenize_with_vocabulary(
     Column input,
     TokenizeVocabulary vocabulary,
@@ … @@
     delimiter : Scalar
         Used to identify tokens within ``input``
     default_id : size_type
-        The token id to be used for tokens not found
-        in the vocabulary; Default is -1
+        The token id to be used for tokens not found in the vocabulary; Default is -1

     Returns
     -------
11 changes: 3 additions & 8 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py
@@ -78,18 +78,13 @@ def test_detokenize(input_col, delimiter):
     assert_column_eq(result, expected)


-def test_load_vocabulary(input_col):
-    result = plc.nvtext.tokenize.load_vocabulary(
-        plc.interop.from_arrow(input_col)
-    )
-    assert isinstance(result, plc.nvtext.tokenize.TokenizeVocabulary)
-
-
 @pytest.mark.parametrize("default_id", [-1, 0])
 def test_tokenize_with_vocabulary(input_col, default_id):
     result = plc.nvtext.tokenize.tokenize_with_vocabulary(
         plc.interop.from_arrow(input_col),
-        plc.nvtext.tokenize.load_vocabulary(plc.interop.from_arrow(input_col)),
+        plc.nvtext.tokenize.TokenizeVocabulary(
+            plc.interop.from_arrow(input_col)
+        ),
         plc.interop.from_arrow(pa.scalar(" ")),
         default_id,
     )
