
Subword Tokenizer HuggingFace-like API #7942

Merged

Changes shown are from 13 of the PR's 43 commits.

Commits:
a949b2e  first_successful_compilation (VibhuJawa, Apr 9, 2021)
0f3623f  working_subword_tokenizer (VibhuJawa, Apr 9, 2021)
991085c  minor bug fixes (VibhuJawa, Apr 9, 2021)
56e0552  first_successful_compilation (VibhuJawa, Apr 9, 2021)
96f1921  working_subword_tokenizer (VibhuJawa, Apr 9, 2021)
30405c5  minor bug fixes (VibhuJawa, Apr 9, 2021)
100ec9e  Merge branch 'fea_subword_inmem_hash_bindings' of github.com:vibhujaw… (VibhuJawa, Apr 9, 2021)
0b7956e  Added cleaner API (VibhuJawa, Apr 12, 2021)
20a5c24  some API cleanup and inital tests (VibhuJawa, Apr 15, 2021)
0843cd7  test cleanup (VibhuJawa, Apr 15, 2021)
6d22915  cleanup + working tests (VibhuJawa, Apr 15, 2021)
37196d0  Modifed CI (VibhuJawa, Apr 15, 2021)
daa087e  Documentation Changes (VibhuJawa, Apr 15, 2021)
25049c0  fix_test (VibhuJawa, Apr 15, 2021)
cecfaa4  fixed style issues (VibhuJawa, Apr 15, 2021)
9e13a52  Fixing style issues with subword_tokenize.pyx (VibhuJawa, Apr 15, 2021)
c2b936e  style fix to /subword_tokenize.pxd (VibhuJawa, Apr 15, 2021)
221ccf7  Fixed some import and ci installation issues (VibhuJawa, Apr 16, 2021)
44acf86  Update python/cudf/cudf/core/subword_tokenizer.py (VibhuJawa, Apr 19, 2021)
f3c474d  Update python/cudf/cudf/_lib/cpp/nvtext/subword_tokenize.pxd (VibhuJawa, Apr 19, 2021)
74a11a1  Update python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx (VibhuJawa, Apr 19, 2021)
24f66ff  Update python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx (VibhuJawa, Apr 19, 2021)
0ec8201  Update python/cudf/cudf/core/subword_tokenizer.py (VibhuJawa, Apr 19, 2021)
837e3a1  Update python/cudf/cudf/core/subword_tokenizer.py (VibhuJawa, Apr 19, 2021)
ff3c458  Update python/cudf/cudf/core/subword_tokenizer.py (VibhuJawa, Apr 19, 2021)
beb90a1  Update python/cudf/cudf/core/subword_tokenizer.py (VibhuJawa, Apr 19, 2021)
e52922d  Update python/cudf/cudf/core/subword_tokenizer.py (VibhuJawa, Apr 19, 2021)
ab868f3  Update python/cudf/cudf/core/subword_tokenizer.py (VibhuJawa, Apr 19, 2021)
e979eff  Update python/cudf/cudf/core/subword_tokenizer.py (VibhuJawa, Apr 19, 2021)
8ada172  Update python/cudf/cudf/core/subword_tokenizer.py (VibhuJawa, Apr 19, 2021)
3af9c2e  Update python/cudf/cudf/core/subword_tokenizer.py (VibhuJawa, Apr 19, 2021)
5fac42e  Addressed reviews and code style related changes (VibhuJawa, Apr 19, 2021)
4679c85  Added Transformers dependency (VibhuJawa, Apr 19, 2021)
6d3e86c  Added transformers to setup.py (VibhuJawa, Apr 19, 2021)
ee8a95b  merged branch-0.20 into fea_subword_inmem_hash_bindings (VibhuJawa, Apr 21, 2021)
0387d97  fixed style check in python/cudf/setup.py (VibhuJawa, Apr 21, 2021)
b940580  Removed transformers ci/gpu/build.sh (VibhuJawa, Apr 21, 2021)
0018779  Address reviews to the API (VibhuJawa, Apr 21, 2021)
26cc440  fixed documentation formatting (VibhuJawa, Apr 22, 2021)
e5cc893  mypy style fixes (VibhuJawa, Apr 22, 2021)
23e9fa4  added transformers to dev_requirements.txt (VibhuJawa, Apr 22, 2021)
01b4f43  Added a test to the old subword tokenizer API to triage CI error (VibhuJawa, Apr 23, 2021)
2aef505  Merge branch 'branch-0.20' into fea_subword_inmem_hash_bindings (VibhuJawa, May 3, 2021)
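
Taken together, these commits add a HuggingFace-style SubwordTokenizer class in cudf.core.subword_tokenizer, backed by new Cython bindings that load the hashed vocabulary into memory once instead of on every call. A minimal usage sketch before the file-by-file changes; the class and module come from this PR, the call keywords follow cuDF's documentation of the final API, and the vocabulary path is a placeholder:

import cudf
from cudf.core.subword_tokenizer import SubwordTokenizer

# Load the hashed vocabulary once at construction
# ('voc_hash.txt' is a placeholder for a precomputed vocabulary hash file).
tokenizer = SubwordTokenizer('voc_hash.txt', do_lower_case=True)

ser = cudf.Series(['this is the', 'best book'])

# HuggingFace-style call, returning a dict of input_ids, attention_mask
# and metadata (cupy arrays when return_tensors='cp').
output = tokenizer(ser,
                   max_length=8,
                   max_num_rows=len(ser),
                   padding='max_length',
                   return_tensors='cp',
                   truncation=True)
print(output['input_ids'].shape)  # (2, 8)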
1 change: 1 addition & 0 deletions ci/gpu/build.sh

@@ -180,6 +180,7 @@ else

 gpuci_logger "Installing $CUDF_CONDA_FILE & $KAFKA_CONDA_FILE"
 conda install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE"
+conda install -c conda-forge tokenizers

 install_dask
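
This new CI dependency pulls in HuggingFace's tokenizers package so the test suite can compare GPU output against a CPU reference. A hedged sketch of the kind of parity check this enables; the vocabulary file paths, the pad id of 0, and the comparison itself are illustrative assumptions, not the actual test in this PR:

import cudf
from cudf.core.subword_tokenizer import SubwordTokenizer
from tokenizers import BertWordPieceTokenizer

text = 'this is a test'

# CPU reference ('vocab.txt' is an assumed plain-text BERT vocabulary).
# add_special_tokens=False because the GPU tokenizer inserts no [CLS]/[SEP].
cpu_tok = BertWordPieceTokenizer('vocab.txt', lowercase=True)
ref_ids = cpu_tok.encode(text, add_special_tokens=False).ids

# GPU tokenizer ('voc_hash.txt' is the assumed hashed form of the same vocab).
gpu_tok = SubwordTokenizer('voc_hash.txt', do_lower_case=True)
out = gpu_tok(cudf.Series([text]),
              max_length=32,
              max_num_rows=1,
              padding='max_length',
              return_tensors='cp',
              truncation=True)

# Strip padding (assumed pad id 0) before comparing with the reference.
got = out['input_ids'][0]
assert got[got != 0].tolist() == ref_ids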
6 changes: 6 additions & 0 deletions docs/cudf/source/api.rst

@@ -206,6 +206,12 @@ Window
 .. autoclass:: Rolling
     :members:

+SubwordTokenizer
+----------------
+.. currentmodule:: cudf.core.subword_tokenizer
+
+.. autoclass:: SubwordTokenizer
+    :members:

 General utility functions
 -------------------------
28 changes: 27 additions & 1 deletion python/cudf/cudf/_lib/cpp/nvtext/subword_tokenize.pxd

@@ -3,7 +3,8 @@
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-from libc.stdint cimport uint32_t
+from libc.stdint cimport uint16_t, uint32_t
+

 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view

@@ -17,6 +18,31 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
         unique_ptr[column] tensor_attention_mask
         unique_ptr[column] tensor_metadata

+    cdef struct hashed_vocabulary "nvtext::hashed_vocabulary":
+        uint16_t first_token_id
+        uint16_t separator_token_id
+        uint16_t unknown_token_id
+        uint32_t outer_hash_a
+        uint32_t outer_hash_b
+        uint16_t num_bin
+        unique_ptr[column] table
+        unique_ptr[column] bin_coefficients
+        unique_ptr[column] bin_offsets
+
+    cdef unique_ptr[hashed_vocabulary] load_vocabulary_file(
+        const string &filename_hashed_vocabulary
+    ) except +
+
+    cdef tokenizer_result subword_tokenize(
+        const column_view & strings,
+        hashed_vocabulary & hashed_vocablary_obj,
+        uint32_t max_sequence_length,
+        uint32_t stride,
+        bool do_lower,
+        bool do_truncate,
+        uint32_t max_rows_tensor
+    ) except +
+
     cdef tokenizer_result subword_tokenize(
         const column_view &strings,
         const string &filename_hashed_vocabulary,
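
For orientation: the hashed_vocabulary fields describe a two-level (perfect) hash over the vocabulary. The outer hash (outer_hash_a, outer_hash_b, num_bin) picks a bin; bin_coefficients and bin_offsets then locate the token's slot in the flat table column. A toy CPU-side sketch of that lookup, where the slot arithmetic is an invented stand-in for libcudf's CUDA kernel and only the role of each field is faithful:

from dataclasses import dataclass

@dataclass
class ToyVocab:
    # Mirrors the roles of nvtext::hashed_vocabulary's fields; the values
    # and the hashing below are illustrative, not libcudf's actual scheme.
    outer_hash_a: int
    outer_hash_b: int
    num_bin: int
    bin_coefficients: list  # one inner-hash coefficient per bin
    bin_offsets: list       # start slot of each bin in `table`
    table: list             # flat list of (token_key, token_id) slots
    unknown_token_id: int

def lookup(v: ToyVocab, token_key: int) -> int:
    # Outer hash selects a bin for the token...
    b = (v.outer_hash_a * token_key + v.outer_hash_b) % v.num_bin
    # ...and the bin's own coefficient resolves collisions inside the bin
    # (here a made-up bin of size 2).
    slot = v.bin_offsets[b] + (v.bin_coefficients[b] * token_key) % 2
    key, token_id = v.table[slot]
    return token_id if key == token_key else v.unknown_token_id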
49 changes: 43 additions & 6 deletions python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx

@@ -11,25 +11,63 @@ from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.nvtext.subword_tokenize cimport (
     subword_tokenize as cpp_subword_tokenize,
+    hashed_vocabulary as cpp_hashed_vocabulary,
+    load_vocabulary_file as cpp_load_vocabulary_file,
     tokenizer_result as cpp_tokenizer_result,
-    move as tr_move
+    move as tr_move,
 )
 from cudf._lib.column cimport Column


-def subword_tokenize(
+cdef class Hashed_Vocabulary:
+    cdef unique_ptr[cpp_hashed_vocabulary] c_obj
+
+    def __cinit__(self, hash_file):
+        cdef string c_hash_file = <string>str(hash_file).encode()
+        with nogil:
+            self.c_obj = move(cpp_load_vocabulary_file(c_hash_file))
+
+
+def subword_tokenize_inmem_hash(Column strings,
+                                Hashed_Vocabulary hashed_vocablary,
+                                uint32_t max_sequence_length=64,
+                                uint32_t stride=48,
+                                bool do_lower=True,
+                                bool do_truncate=False,
+                                uint32_t max_rows_tensor=500):
+    cdef column_view c_strings = strings.view()
+    cdef cpp_tokenizer_result c_result
+    with nogil:
+        c_result = tr_move(
+            cpp_subword_tokenize(
+                c_strings,
+                hashed_vocablary.c_obj.get()[0],
+                max_sequence_length,
+                stride,
+                do_lower,
+                do_truncate,
+                max_rows_tensor
+            )
+        )
+    # return the 3 tensor components
+    tokens = Column.from_unique_ptr(move(c_result.tensor_token_ids))
+    masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask))
+    metadata = Column.from_unique_ptr(move(c_result.tensor_metadata))
+    return tokens, masks, metadata
+
+
+def subword_tokenize_vocab_file(
     Column strings,
     object hash_file,
     uint32_t max_sequence_length=64,
     uint32_t stride=48,
     bool do_lower=True,
     bool do_truncate=False,
     uint32_t max_rows_tensor=500
 ):
     cdef column_view c_strings = strings.view()
-    cdef string c_hash_file = <string>str(hash_file).encode()
     cdef cpp_tokenizer_result c_result

+    cdef string c_hash_file = <string>str(hash_file).encode()
     with nogil:
         c_result = tr_move(
             cpp_subword_tokenize(

@@ -42,7 +80,6 @@ def subword_tokenize(
                 max_rows_tensor
             )
         )
-
     # return the 3 tensor components
     tokens = Column.from_unique_ptr(move(c_result.tensor_token_ids))
     masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask))
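
The practical effect of the new binding is that the vocabulary hash is loaded once and reused across tokenize calls, rather than re-read from disk each time. A minimal sketch against the internal functions added above (cudf._lib is internal API; 'voc_hash.txt' is a placeholder path):

import cudf
from cudf._lib.nvtext.subword_tokenize import (
    Hashed_Vocabulary,
    subword_tokenize_inmem_hash,
)

ser = cudf.Series(['this is a test', 'hello world'])

# Pay the vocabulary-load cost once...
vocab = Hashed_Vocabulary('voc_hash.txt')

# ...then tokenize any number of string columns against the in-memory hash.
tokens, masks, metadata = subword_tokenize_inmem_hash(
    ser._column, vocab,
    max_sequence_length=64, stride=48,
    do_lower=True, do_truncate=False, max_rows_tensor=500,
)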
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/string.py

@@ -40,7 +40,7 @@
     porter_stemmer_measure as cpp_porter_stemmer_measure,
 )
 from cudf._lib.nvtext.subword_tokenize import (
-    subword_tokenize as cpp_subword_tokenize,
+    subword_tokenize_vocab_file as cpp_subword_tokenize_vocab_file,
 )
 from cudf._lib.nvtext.tokenize import (
     _count_tokens_column as cpp_count_tokens_column,

@@ -4435,7 +4435,7 @@ def subword_tokenize(
         array([[0, 0, 2],
                [1, 0, 1]], dtype=uint32)
         """
-        tokens, masks, metadata = cpp_subword_tokenize(
+        tokens, masks, metadata = cpp_subword_tokenize_vocab_file(
             self._column,
             hash_file,
             max_length,
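
At the public level, Series.str.subword_tokenize keeps its existing behavior; only the binding it delegates to was renamed. A short sketch of that unchanged entry point, consistent with the docstring excerpt in the hunk above ('voc_hash.txt' is again a placeholder):

import cudf

ser = cudf.Series(['this is the', 'best book'])

# Returns flat cupy arrays: token ids, attention masks and sequence metadata.
tokens, masks, metadata = ser.str.subword_tokenize('voc_hash.txt',
                                                   max_length=8, stride=8)

# One row per generated sequence, max_length columns per row.
token_matrix = tokens.reshape(-1, 8)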