From a5e9fa4bf95c9ebbe6b0f66acfcdf805a2b315d3 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 22 May 2023 10:27:25 -0400
Subject: [PATCH] Fix tokenize with non-space delimiter

---
 python/cudf/cudf/core/column/string.py |  5 ++++-
 python/cudf/cudf/tests/test_text.py    | 29 ++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index a3163f1cebe..9319881669f 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4609,7 +4609,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
             )
         if isinstance(self._parent, cudf.Series):
             result.index = self._parent.index.repeat(  # type: ignore
-                self.token_count()
+                self.token_count(delimiter=delimiter)
             )
         return result
 
@@ -5296,6 +5296,9 @@ def minhash(
 
 
 def _massage_string_arg(value, name, allow_col=False):
+    if isinstance(value, cudf.Scalar):
+        return value
+
     if isinstance(value, str):
         return cudf.Scalar(value, dtype="str")
 
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index 899248513de..f0e0e52142f 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -52,6 +52,35 @@ def test_tokenize():
     assert_eq(expected, actual)
 
 
+def test_tokenize_delimiter():
+    """Tokenizing on "o" must repeat each row label once per token."""
+    strings = cudf.Series(
+        [
+            "the quick fox jumped over the lazy dog",
+            "the siamésé cat jumped under the sofa",
+            None,
+            "",
+        ]
+    )
+
+    # Build from a plain list so no index alignment can reorder values.
+    expected_index = strings.index.repeat(strings.str.token_count("o"))
+    expected = cudf.Series(
+        [
+            "the quick f",
+            "x jumped ",
+            "ver the lazy d",
+            "g",
+            "the siamésé cat jumped under the s",
+            "fa",
+        ],
+        index=expected_index,
+    )
+    actual = strings.str.tokenize(delimiter="o")
+    assert type(expected) is type(actual)
+    assert_eq(expected, actual)
+
+
 def test_detokenize():
     strings = cudf.Series(
         [