Remove nvtext::load_vocabulary from pylibcudf (#17220)
This PR follows up #17100 to address the last review comment there (#17100 (review)): it removes the nvtext::load_vocabulary wrapper from pylibcudf in favor of constructing TokenizeVocabulary directly.
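
For callers, the migration is mechanical: construct the TokenizeVocabulary object directly instead of calling the removed load_vocabulary free function. A minimal before/after sketch, mirroring the test change below (the vocabulary and input strings here are illustrative, not taken from the PR):

    import pyarrow as pa
    import pylibcudf as plc

    # Illustrative vocabulary column; any strings column works.
    vocab = plc.interop.from_arrow(pa.array(["the", "quick", "fox"]))

    # Before (removed by this PR):
    #   tv = plc.nvtext.tokenize.load_vocabulary(vocab)
    # After: call the TokenizeVocabulary constructor directly.
    tv = plc.nvtext.tokenize.TokenizeVocabulary(vocab)

    # tokenize_with_vocabulary itself is unchanged.
    result = plc.nvtext.tokenize.tokenize_with_vocabulary(
        plc.interop.from_arrow(pa.array(["the quick fox"])),  # strings to tokenize
        tv,
        plc.interop.from_arrow(pa.scalar(" ")),  # delimiter
        -1,  # default_id for tokens not found in the vocabulary
    )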

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17220
Matt711 authored Oct 31, 2024
1 parent f7020f1 commit 02a50e8
Showing 4 changed files with 12 additions and 42 deletions.
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/tokenize_vocabulary.py
@@ -2,9 +2,10 @@

 from __future__ import annotations

+import pylibcudf as plc
+
 import cudf
 from cudf._lib.nvtext.tokenize import (
-    TokenizeVocabulary as cpp_tokenize_vocabulary,
     tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
 )

@@ -20,7 +21,7 @@ class TokenizeVocabulary:
     """

     def __init__(self, vocabulary: "cudf.Series"):
-        self.vocabulary = cpp_tokenize_vocabulary(
+        self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary(
             vocabulary._column.to_pylibcudf(mode="read")
         )

2 changes: 0 additions & 2 deletions python/pylibcudf/pylibcudf/nvtext/tokenize.pxd
@@ -21,8 +21,6 @@ cpdef Column character_tokenize(Column input)

 cpdef Column detokenize(Column input, Column row_indices, Scalar separator=*)

-cpdef TokenizeVocabulary load_vocabulary(Column input)
-
 cpdef Column tokenize_with_vocabulary(
     Column input,
     TokenizeVocabulary vocabulary,
36 changes: 6 additions & 30 deletions python/pylibcudf/pylibcudf/nvtext/tokenize.pyx
@@ -43,8 +43,7 @@ cpdef Column tokenize_scalar(Column input, Scalar delimiter=None):
     input : Column
         Strings column to tokenize
     delimiter : Scalar
-        String scalar used to separate individual
-        strings into tokens
+        String scalar used to separate individual strings into tokens

     Returns
     -------
@@ -106,7 +105,7 @@ cpdef Column count_tokens_scalar(Column input, Scalar delimiter=None):
     ----------
     input : Column
         Strings column to count tokens
-    delimiters : Scalar]
+    delimiters : Scalar
         String scalar used to separate each string into tokens

     Returns
@@ -141,8 +140,7 @@ cpdef Column count_tokens_column(Column input, Column delimiters):
     input : Column
         Strings column to count tokens
     delimiters : Column
-        Strings column used to separate
-        each string into tokens
+        Strings column used to separate each string into tokens

     Returns
     -------
@@ -198,11 +196,9 @@ cpdef Column detokenize(
     input : Column
         Strings column to detokenize
     row_indices : Column
-        The relative output row index assigned
-        for each token in the input column
+        The relative output row index assigned for each token in the input column
     separator : Scalar
-        String to append after concatenating
-        each token to the proper output row
+        String to append after concatenating each token to the proper output row

     Returns
     -------
@@ … @@

     return Column.from_libcudf(move(c_result))

-cpdef TokenizeVocabulary load_vocabulary(Column input):
-    """
-    Create a ``TokenizeVocabulary`` object from a strings column.
-
-    For details, see cpp:func:`cudf::nvtext::load_vocabulary`
-
-    Parameters
-    ----------
-    input : Column
-        Strings for the vocabulary
-
-    Returns
-    -------
-    TokenizeVocabulary
-        Object to be used with cpp:func:`cudf::nvtext::tokenize_with_vocabulary`
-    """
-    return TokenizeVocabulary(input)
-
-
 cpdef Column tokenize_with_vocabulary(
     Column input,
     TokenizeVocabulary vocabulary,
@@ … @@
     delimiter : Scalar
         Used to identify tokens within ``input``
     default_id : size_type
-        The token id to be used for tokens not found
-        in the vocabulary; Default is -1
+        The token id to be used for tokens not found in the vocabulary; Default is -1

     Returns
     -------
11 changes: 3 additions & 8 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py
@@ -78,18 +78,13 @@ def test_detokenize(input_col, delimiter):
     assert_column_eq(result, expected)


-def test_load_vocabulary(input_col):
-    result = plc.nvtext.tokenize.load_vocabulary(
-        plc.interop.from_arrow(input_col)
-    )
-    assert isinstance(result, plc.nvtext.tokenize.TokenizeVocabulary)
-
-
 @pytest.mark.parametrize("default_id", [-1, 0])
 def test_tokenize_with_vocabulary(input_col, default_id):
     result = plc.nvtext.tokenize.tokenize_with_vocabulary(
         plc.interop.from_arrow(input_col),
-        plc.nvtext.tokenize.load_vocabulary(plc.interop.from_arrow(input_col)),
+        plc.nvtext.tokenize.TokenizeVocabulary(
+            plc.interop.from_arrow(input_col)
+        ),
         plc.interop.from_arrow(pa.scalar(" ")),
         default_id,
     )
