[REVIEW] Remove str.subword_tokenize #9968

Merged

Changes from 4 commits

23 changes: 1 addition & 22 deletions cpp/include/nvtext/subword_tokenize.hpp
@@ -130,9 +130,7 @@ struct tokenizer_result {
* larger than the max value for cudf::size_type
*
* @param strings The input strings to tokenize.
* @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file.
* Note that this is the file AFTER python/perfect_hash.py has been used
* for preprocessing.
* @param vocabulary_table The vocabulary table pre-loaded into this object.
* @param max_sequence_length Limit of the number of token-ids per row in final tensor
* for each string.
* @param stride Each row in the output token-ids will replicate `max_sequence_length - stride`
@@ -150,25 +148,6 @@ struct tokenizer_result {
* @param mr Memory resource to allocate any returned objects.
* @return token-ids, attention-mask, and metadata
*/
tokenizer_result subword_tokenize(
cudf::strings_column_view const& strings,
std::string const& filename_hashed_vocabulary,
uint32_t max_sequence_length,
uint32_t stride,
bool do_lower_case,
bool do_truncate,
uint32_t max_rows_tensor,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc subword_tokenize()
*
* This function differs from the one above by only the hashed vocabulary parameter.
* The file can be pre-loaded using the @ref load_vocabulary_file API and then
* passed in place of the file name in a call to this API.
*
* @param vocabulary_table The vocabulary table pre-loaded into this object.
*/
tokenizer_result subword_tokenize(
cudf::strings_column_view const& strings,
hashed_vocabulary const& vocabulary_table,
22 changes: 0 additions & 22 deletions cpp/src/text/subword/subword_tokenize.cu
@@ -249,28 +249,6 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,

} // namespace detail

tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
std::string const& filename_hashed_vocabulary,
uint32_t max_sequence_length,
uint32_t stride,
bool do_lower_case,
bool do_truncate,
uint32_t max_rows_tensor,
rmm::mr::device_memory_resource* mr)
{
auto vocab_table = load_vocabulary_file(filename_hashed_vocabulary, mr);
CUDF_FUNC_RANGE();
return detail::subword_tokenize(strings,
*vocab_table,
max_sequence_length,
stride,
do_lower_case,
do_truncate,
max_rows_tensor,
rmm::cuda_stream_default,
mr);
}

tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
hashed_vocabulary const& vocabulary_table,
uint32_t max_sequence_length,
21 changes: 14 additions & 7 deletions cpp/tests/text/subword_tests.cpp
@@ -67,12 +67,13 @@ TEST(TextSubwordTest, Tokenize)
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);

uint32_t max_sequence_length = 16;
uint32_t stride = 16;

auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
max_sequence_length,
stride,
true, // do_lower_case
@@ -119,12 +120,13 @@ TEST(TextSubwordTest, TokenizeMultiRow)
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);

uint32_t max_sequence_length = 8;
uint32_t stride = 6;

auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
max_sequence_length,
stride,
true, // do_lower_case
@@ -148,12 +150,13 @@ TEST(TextSubwordTest, TokenizeMaxEqualsTokens)
cudf::test::strings_column_wrapper strings({"This is a test."});
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);

uint32_t max_sequence_length = 5; // five tokens in strings;
uint32_t stride = 5; // this should not affect the result

auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
max_sequence_length,
stride,
true, // do_lower_case
@@ -175,8 +178,10 @@ TEST(TextSubwordTest, ParameterErrors)
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);

EXPECT_THROW(nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
12, // max_sequence_length
13, // stride <= max_sequence_length
true, // do_lower_case
@@ -185,7 +190,7 @@
cudf::logic_error);

EXPECT_THROW(nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
5,
5,
true, // do_lower_case
@@ -199,8 +204,9 @@ TEST(TextSubwordTest, EmptyStrings)
cudf::test::strings_column_wrapper strings;
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);
auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
16,
16,
true, // do_lower_case
@@ -217,8 +223,9 @@ TEST(TextSubwordTest, AllNullStrings)
cudf::test::strings_column_wrapper strings({"", "", ""}, {0, 0, 0});
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);
auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
16,
16,
true, // do_lower_case
35 changes: 0 additions & 35 deletions python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
@@ -58,38 +58,3 @@ def subword_tokenize_inmem_hash(
masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask))
metadata = Column.from_unique_ptr(move(c_result.tensor_metadata))
return tokens, masks, metadata


def subword_tokenize_vocab_file(
Column strings,
object hash_file,
uint32_t max_sequence_length=64,
uint32_t stride=48,
bool do_lower=True,
bool do_truncate=False,
uint32_t max_rows_tensor=500
):
"""
Subword tokenizes text series by using the hashed vocabulary
stored on disk
"""
cdef column_view c_strings = strings.view()
cdef cpp_tokenizer_result c_result
cdef string c_hash_file = <string>str(hash_file).encode()
with nogil:
c_result = tr_move(
cpp_subword_tokenize(
c_strings,
c_hash_file,
max_sequence_length,
stride,
do_lower,
do_truncate,
max_rows_tensor
)
)
# return the 3 tensor components
tokens = Column.from_unique_ptr(move(c_result.tensor_token_ids))
masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask))
metadata = Column.from_unique_ptr(move(c_result.tensor_metadata))
return tokens, masks, metadata
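
The removed `subword_tokenize_vocab_file` binding loaded the hashed vocabulary from disk on every call, while the surviving `subword_tokenize_inmem_hash` binding expects a vocabulary that is already resident in memory (which is what `SubwordTokenizer` provides). The one-time preprocessing step that produces the hashed file is unchanged; a minimal sketch, with illustrative file names:

```python
# One-time preprocessing: hash a raw BERT vocabulary so the GPU tokenizer
# can consume it. The file names here are illustrative.
from cudf.utils.hash_vocab_utils import hash_vocab

hash_vocab("bert-base-uncased-vocab.txt", "voc_hash.txt")
```
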
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/strings/__init__.py
@@ -12,7 +12,6 @@
is_letter_multi,
porter_stemmer_measure,
)
from cudf._lib.nvtext.subword_tokenize import subword_tokenize_vocab_file
from cudf._lib.nvtext.tokenize import (
_count_tokens_column,
_count_tokens_scalar,
113 changes: 0 additions & 113 deletions python/cudf/cudf/core/column/string.py
@@ -4711,119 +4711,6 @@ def filter_tokens(
),
)

def subword_tokenize(
self,
hash_file: str,
max_length: int = 64,
stride: int = 48,
do_lower: bool = True,
do_truncate: bool = False,
max_rows_tensor: int = 500,
) -> Tuple[cupy.ndarray, cupy.ndarray, cupy.ndarray]:
"""
Run CUDA BERT subword tokenizer on cuDF strings column.
Encodes words to token ids using vocabulary from a pretrained
tokenizer.

This function requires about 21x the number of character bytes
in the input strings column as working memory.

``Series.str.subword_tokenize`` is deprecated and will be removed.
Use ``cudf.core.subword_tokenizer.SubwordTokenizer`` instead.

Parameters
----------
hash_file : str
Path to hash file containing vocabulary of words with token-ids.
This can be created from the raw vocabulary
using the ``cudf.utils.hash_vocab_utils.hash_vocab`` function
max_length : int, Default is 64
Limits the length of the sequence returned.
If tokenized string is shorter than max_length,
output will be padded with 0s.
If the tokenized string is longer than max_length and
do_truncate == False, there will be multiple returned
sequences containing the overflowing token-ids.
stride : int, Default is 48
If do_truncate == False and the tokenized string is larger
than max_length, the sequences containing the overflowing
token-ids can contain duplicated token-ids from the main
sequence. If max_length is equal to stride there are no
duplicated-id tokens. If stride is 80% of max_length,
20% of the first sequence will be repeated on the second
sequence and so on until the entire sentence is encoded.
do_lower : bool, Default is True
If set to true, original text will be lowercased before encoding.
do_truncate : bool, Default is False
If set to true, strings will be truncated and padded to
max_length. Each input string will result in exactly one output
sequence. If set to false, there may be multiple output
sequences when the max_length is smaller than generated tokens.
max_rows_tensor : int, Default is 500
Maximum number of rows for the output token-ids expected
to be generated by the tokenizer.
Used for allocating temporary working memory on the GPU device.
If the output generates a larger number of rows, behavior
is undefined.
This will vary based on stride, truncation, and max_length.
For example, for non-overlapping sequences output rows
will be the same as input rows.

Returns
-------
token-ids : cupy.ndarray
The token-ids for each string padded with 0s to max_length.
attention-mask : cupy.ndarray
The mask for token-ids result where corresponding positions
identify valid token-id values.
metadata : cupy.ndarray
Each row contains the index id of the original string and the
first and last index of the token-ids that are non-padded and
non-overlapping.

Examples
--------
>>> import cudf
>>> from cudf.utils.hash_vocab_utils import hash_vocab
>>> hash_vocab('bert-base-uncased-vocab.txt', 'voc_hash.txt')
>>> ser = cudf.Series(['this is the', 'best book'])
>>> stride, max_length = 8, 8
>>> max_rows_tensor = len(ser)
>>> tokens, masks, metadata = ser.str.subword_tokenize('voc_hash.txt',
... max_length=max_length, stride=stride,
... max_rows_tensor=max_rows_tensor)
>>> tokens.reshape(-1, max_length)
array([[2023, 2003, 1996, 0, 0, 0, 0, 0],
[2190, 2338, 0, 0, 0, 0, 0, 0]], dtype=uint32)
>>> masks.reshape(-1, max_length)
array([[1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0, 0, 0]], dtype=uint32)
>>> metadata.reshape(-1, 3)
array([[0, 0, 2],
[1, 0, 1]], dtype=uint32)
"""
warnings.warn(
"`Series.str.subword_tokenize` is deprecated and will be removed "
"in future versions of cudf. Use "
"`cudf.core.subword_tokenizer.SubwordTokenizer` instead.",
FutureWarning,
)

tokens, masks, metadata = libstrings.subword_tokenize_vocab_file(
self._column,
hash_file,
max_length,
stride,
do_lower,
do_truncate,
max_rows_tensor,
)
return (
cupy.asarray(tokens),
cupy.asarray(masks),
cupy.asarray(metadata),
)

def porter_stemmer_measure(self) -> SeriesOrIndex:
"""
Compute the Porter Stemmer measure for each string.
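
The deprecation notice in the removed docstring points users to `cudf.core.subword_tokenizer.SubwordTokenizer`. A minimal sketch of the replacement call, assuming the `SubwordTokenizer` API as documented around this release (keyword names such as `max_num_rows` and `return_tensors` come from that class, not from this diff):

```python
import cudf
from cudf.core.subword_tokenizer import SubwordTokenizer

ser = cudf.Series(["this is the", "best book"])

# 'voc_hash.txt' is the pre-hashed vocabulary produced by hash_vocab().
tokenizer = SubwordTokenizer("voc_hash.txt", do_lower_case=True)

output = tokenizer(
    ser,
    max_length=8,
    max_num_rows=len(ser),
    padding="max_length",
    truncation=True,
    return_tensors="cp",  # CuPy arrays, matching the removed API's return type
)

# SubwordTokenizer returns a dict rather than a 3-tuple.
tokens = output["input_ids"]
masks = output["attention_mask"]
metadata = output["metadata"]
```

Note that `SubwordTokenizer` can also insert special tokens (`add_special_tokens`, default `True`), so the token layout may differ slightly from the removed `Series.str.subword_tokenize` output.
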
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/subword_tokenizer.py
@@ -21,7 +21,7 @@ def _cast_to_appropriate_type(ar, cast_type):
from torch.utils.dlpack import from_dlpack

elif cast_type == "tf":
from tf.experimental.dlpack import from_dlpack
from tensorflow.experimental.dlpack import from_dlpack

return from_dlpack(ar.astype("int32").toDlpack())

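
The final hunk fixes a latent import bug: `tf` is only a conventional alias, not an importable module, so requesting TensorFlow tensors through this helper would have raised an import error. A minimal sketch of the corrected conversion path used by `_cast_to_appropriate_type`, assuming TensorFlow 2.x and CuPy are installed (array values are illustrative):

```python
import cupy as cp
from tensorflow.experimental.dlpack import from_dlpack  # not "tf.experimental..."

# Mirror _cast_to_appropriate_type: cast the CuPy buffer to int32, then
# hand it to TensorFlow through DLPack without a host round trip.
ar = cp.asarray([[1, 1, 0], [1, 0, 0]], dtype="uint32")
tensor = from_dlpack(ar.astype("int32").toDlpack())
# 'tensor' is now a TensorFlow tensor backed by the same GPU memory.
```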