rapidsai · rapids-bot · May 23, 2023 · May 22, 2023 · May 23, 2023
@@ -4609,7 +4609,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
             )
         if isinstance(self._parent, cudf.Series):
             result.index = self._parent.index.repeat(  # type: ignore
-                self.token_count()
+                self.token_count(delimiter=delimiter)
             )
         return result
 
@@ -5296,6 +5296,9 @@ def minhash(
 
 
 def _massage_string_arg(value, name, allow_col=False):
+    if isinstance(value, cudf.Scalar):
+        return value
+
     if isinstance(value, str):
         return cudf.Scalar(value, dtype="str")
 

@@ -52,6 +52,35 @@ def test_tokenize():
     assert_eq(expected, actual)
 
 
+def test_tokenize_delimiter():  # tokenize() with a non-default delimiter
+    strings = cudf.Series(
+        [
+            "the quick fox jumped over the lazy dog",
+            "the siamésé cat jumped under the sofa",
+            None,
+            "",
+        ]
+    )
+    # Split on "o": 4 tokens from row 0, 2 from row 1, 0 from None and "".
+    expected_values = cudf.Series(
+        [
+            "the quick f",
+            "x jumped ",
+            "ver the lazy d",
+            "g",
+            "the siamésé cat jumped under the s",
+            "fa",
+        ]
+    )
+    expected_index = strings.index.repeat(strings.str.token_count("o"))
+    expected = cudf.Series(expected_values, index=expected_index)
+    # Each row's index label is repeated token_count("o") times (see above).
+    actual = strings.str.tokenize(delimiter="o")
+    # The result must have the same type as expected and compare equal to it.
+    assert type(expected) == type(actual)
+    assert_eq(expected, actual)
+
+
 def test_detokenize():
     strings = cudf.Series(
         [