From 50f1bf5fa2d3c86b305f4241ddb1814ddbce181c Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 21 Feb 2023 12:54:36 -0500
Subject: [PATCH 1/4] Make character_ngrams return the right index

---
 python/cudf/cudf/core/column/string.py |  8 ++++---
 python/cudf/cudf/tests/test_text.py    | 32 +++++++++++++++++++++-----
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 8d6ffe48957..112aba0f8a5 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4813,8 +4813,6 @@ def character_ngrams(
         dtype: list
         """
         ngrams = libstrings.generate_character_ngrams(self._column, n)
-        if as_list is False:
-            return self._return_or_inplace(ngrams, retain_index=False)
 
         # convert the output to a list by just generating the
         # offsets for the output list column
@@ -4831,7 +4829,11 @@ def character_ngrams(
             null_count=self._column.null_count,
             children=(oc, ngrams),
         )
-        return self._return_or_inplace(lc, retain_index=False)
+        result = self._return_or_inplace(lc, retain_index=True)
+
+        if isinstance(result, cudf.Series) and not as_list:
+            return result.explode()
+        return result
 
     def ngrams_tokenize(
         self, n: int = 2, delimiter: str = " ", separator: str = "_"
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index 627bf0a68bb..2fc7d0c6c1e 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 
 from io import StringIO
 
@@ -231,7 +231,7 @@ def test_ngrams(n, separator, expected_values):
 
 
 @pytest.mark.parametrize(
-    "n, expected_values, as_list",
+    "n, expected_values, expected_index, as_list",
     [
         (
             2,
@@ -247,21 +247,41 @@ def test_ngrams(n, separator, expected_values):
                 "he",
                 "er",
                 "re",
+                cudf.NA,
             ],
+            [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6],
+            False,
+        ),
+        (
+            3,
+            [
+                "thi",
+                "his",
+                cudf.NA,
+                cudf.NA,
+                "boo",
+                "ook",
+                "her",
+                "ere",
+                cudf.NA,
+            ],
+            [1, 1, 2, 3, 4, 4, 5, 5, 6],
             False,
         ),
-        (3, ["thi", "his", "boo", "ook", "her", "ere"], False),
         (
             3,
             [["thi", "his"], [], [], ["boo", "ook"], ["her", "ere"], []],
+            [1, 2, 3, 4, 5, 6],
             True,
         ),
     ],
 )
-def test_character_ngrams(n, expected_values, as_list):
-    strings = cudf.Series(["this", "is", "my", "book", "here", ""])
+def test_character_ngrams(n, expected_values, expected_index, as_list):
+    strings = cudf.Series(
+        ["this", "is", "my", "book", "here", ""], index=[1, 2, 3, 4, 5, 6]
+    )
 
-    expected = cudf.Series(expected_values)
+    expected = cudf.Series(expected_values, index=expected_index)
 
     actual = strings.str.character_ngrams(n=n, as_list=as_list)
 

From b8e7cd53750bdda05a53cb63fe4350aa27c5bbaf Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 7 Mar 2023 10:38:26 -0500
Subject: [PATCH 2/4] Fix index for character_tokenize

---
 python/cudf/cudf/core/column/string.py | 4 +++-
 python/cudf/cudf/tests/test_text.py    | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 4e2b6989370..c11be242218 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4675,7 +4675,9 @@ def character_tokenize(self) -> SeriesOrIndex:
         """
         result_col = libstrings.character_tokenize(self._column)
         if isinstance(self._parent, cudf.Series):
-            return cudf.Series(result_col, name=self._parent.name)
+            lengths = self.len().fillna(0)
+            index = self._parent.index.repeat(lengths)
+            return cudf.Series(result_col, name=self._parent.name, index=index)
         elif isinstance(self._parent, cudf.BaseIndex):
             return cudf.core.index.as_index(result_col, name=self._parent.name)
         else:
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index 2fc7d0c6c1e..f0d5684280a 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -334,7 +334,7 @@ def test_character_tokenize_series():
             ),
         ]
     )
-    expected = cudf.Series(
+    expected_values = cudf.Series(
         [
             "h",
             "e",
@@ -422,6 +422,8 @@ def test_character_tokenize_series():
             "Ǆ",
         ]
     )
+    expected_index = sr.index.repeat(sr.str.len().fillna(0))
+    expected = cudf.Series(expected_values, index=expected_index)
 
     actual = sr.str.character_tokenize()
     assert_eq(expected, actual)

From 55578e124ca4b21e071aa76af9d2e72ef1dbe666 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 7 Mar 2023 10:52:58 -0500
Subject: [PATCH 3/4] Fix index for tokenize

---
 python/cudf/cudf/core/column/string.py | 9 +++++++--
 python/cudf/cudf/tests/test_text.py    | 4 +++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index c11be242218..b58e47246c6 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4574,12 +4574,12 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
         delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
 
         if isinstance(delimiter, Column):
-            return self._return_or_inplace(
+            result = self._return_or_inplace(
                 libstrings._tokenize_column(self._column, delimiter),
                 retain_index=False,
             )
         elif isinstance(delimiter, cudf.Scalar):
-            return self._return_or_inplace(
+            result = self._return_or_inplace(
                 libstrings._tokenize_scalar(self._column, delimiter),
                 retain_index=False,
             )
@@ -4588,6 +4588,11 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
                 f"Expected a Scalar or Column\
                 for delimiters, but got {type(delimiter)}"
             )
+        if isinstance(self._parent, cudf.Series):
+            result.index = self._parent.index.repeat(  # type: ignore
+                self.token_count()
+            )
+        return result
 
     def detokenize(
         self, indices: "cudf.Series", separator: str = " "
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index f0d5684280a..89c428551e4 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -24,7 +24,7 @@ def test_tokenize():
         ]
     )
 
-    expected = cudf.Series(
+    expected_values = cudf.Series(
         [
             "the",
             "quick",
@@ -43,6 +43,8 @@ def test_tokenize():
             "sofa",
         ]
     )
+    expected_index = strings.index.repeat(strings.str.token_count())
+    expected = cudf.Series(expected_values, index=expected_index)
 
     actual = strings.str.tokenize()
 

From b0ec6de2dd311b2efdf83b1358fcf4ad7b2bfca3 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 7 Mar 2023 10:57:08 -0500
Subject: [PATCH 4/4] Update docstring examples to show the repeated index

---
 python/cudf/cudf/core/column/string.py | 88 +++++++++++++-------------
 1 file changed, 44 insertions(+), 44 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index b58e47246c6..d9a6c6c4cd6 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4564,11 +4564,11 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
         >>> ser = cudf.Series(data)
         >>> ser.str.tokenize()
         0      hello
+        0      world
+        1    goodbye
         1      world
+        2      hello
         2    goodbye
-        3      world
-        4      hello
-        5    goodbye
         dtype: object
         """
         delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
@@ -4646,36 +4646,36 @@ def character_tokenize(self) -> SeriesOrIndex:
         >>> data = ["hello world", None, "goodbye, thank you."]
         >>> ser = cudf.Series(data)
         >>> ser.str.character_tokenize()
-        0     h
-        1     e
-        2     l
-        3     l
-        4     o
-        5
-        6     w
-        7     o
-        8     r
-        9     l
-        10    d
-        11    g
-        12    o
-        13    o
-        14    d
-        15    b
-        16    y
-        17    e
-        18    ,
-        19
-        20    t
-        21    h
-        22    a
-        23    n
-        24    k
-        25
-        26    y
-        27    o
-        28    u
-        29    .
+        0    h
+        0    e
+        0    l
+        0    l
+        0    o
+        0
+        0    w
+        0    o
+        0    r
+        0    l
+        0    d
+        2    g
+        2    o
+        2    o
+        2    d
+        2    b
+        2    y
+        2    e
+        2    ,
+        2
+        2    t
+        2    h
+        2    a
+        2    n
+        2    k
+        2
+        2    y
+        2    o
+        2    u
+        2    .
         dtype: object
         """
         result_col = libstrings.character_tokenize(self._column)
@@ -4787,20 +4787,20 @@ def character_ngrams(
         >>> str_series = cudf.Series(['abcd','efgh','xyz'])
         >>> str_series.str.character_ngrams(2)
         0    ab
-        1    bc
-        2    cd
-        3    ef
-        4    fg
-        5    gh
-        6    xy
-        7    yz
+        0    bc
+        0    cd
+        1    ef
+        1    fg
+        1    gh
+        2    xy
+        2    yz
         dtype: object
         >>> str_series.str.character_ngrams(3)
         0    abc
-        1    bcd
-        2    efg
-        3    fgh
-        4    xyz
+        0    bcd
+        1    efg
+        1    fgh
+        2    xyz
         dtype: object
         >>> str_series.str.character_ngrams(3,True)
         0    [abc, bcd]