Skip to content

Commit

Permalink
Fix tokenize with non-space delimiter (#13403)
Browse files Browse the repository at this point in the history
Closes #13399

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #13403
  • Loading branch information
shwina authored May 23, 2023
1 parent 12acf92 commit 7660af0
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 1 deletion.
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -4609,7 +4609,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
)
if isinstance(self._parent, cudf.Series):
result.index = self._parent.index.repeat( # type: ignore
self.token_count()
self.token_count(delimiter=delimiter)
)
return result

Expand Down Expand Up @@ -5296,6 +5296,9 @@ def minhash(


def _massage_string_arg(value, name, allow_col=False):
if isinstance(value, cudf.Scalar):
return value

if isinstance(value, str):
return cudf.Scalar(value, dtype="str")

Expand Down
29 changes: 29 additions & 0 deletions python/cudf/cudf/tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,35 @@ def test_tokenize():
assert_eq(expected, actual)


def test_tokenize_delimiter():
    """Tokenize with a non-space delimiter.

    Regression test for GH #13399: the repeated index must be built from
    token counts computed with the *same* delimiter passed to ``tokenize``,
    not the default whitespace delimiter.
    """
    strings = cudf.Series(
        [
            "the quick fox jumped over the lazy dog",
            "the siamésé cat jumped under the sofa",
            None,
            "",
        ]
    )

    # Expected tokens when splitting on "o"; the null and empty rows yield
    # zero tokens and therefore contribute no rows to the result.
    expected_values = cudf.Series(
        [
            "the quick f",
            "x jumped ",
            "ver the lazy d",
            "g",
            "the siamésé cat jumped under the s",
            "fa",
        ]
    )
    # Each input row's index label is repeated once per token it produces.
    expected_index = strings.index.repeat(strings.str.token_count("o"))
    expected = cudf.Series(expected_values, index=expected_index)

    actual = strings.str.tokenize(delimiter="o")

    # Exact-type comparison: type objects are singletons, so identity
    # (`is`) is the correct check rather than `==`.
    assert type(expected) is type(actual)
    assert_eq(expected, actual)


def test_detokenize():
strings = cudf.Series(
[
Expand Down

0 comments on commit 7660af0

Please sign in to comment.