diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 0902167be8b..e17ecec766a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4620,6 +4620,9 @@ def subword_tokenize( This function requires about 21x the number of character bytes in the input strings column as working memory. + ``ser.str.subword_tokenize`` will be deprecated in future versions. + Use ``cudf.core.subword_tokenizer.SubwordTokenizer`` instead. + Parameters ---------- hash_file : str @@ -4691,6 +4694,14 @@ def subword_tokenize( array([[0, 0, 2], [1, 0, 1]], dtype=uint32) """ + warning_message = ( + "`ser.str.subword_tokenize` API will be deprecated" + " in future versions of cudf.\n" + "Use `cudf.core.subword_tokenizer.SubwordTokenizer` " + "instead" + ) + + warnings.warn(warning_message, FutureWarning) tokens, masks, metadata = cpp_subword_tokenize_vocab_file( self._column, hash_file,