Make string methods return a Series with a useful Index (#12814)
Closes #12806

Many string methods like `character_ngrams` currently return a `Series` with the default index (`RangeIndex`). This PR makes the index of the result correspond to the index of the input.

More specifically, this PR changes the index of the result of the following string methods (a short usage sketch follows the list):

- [x] `character_ngrams`
- [x] `tokenize`
- [x] `character_tokenize`
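
A minimal sketch of the new behavior (not part of the diff below; it assumes a cudf build that includes this change):

```python
import cudf

ser = cudf.Series(["hello world", "goodbye"], index=[10, 20])
print(ser.str.tokenize())
# With this change, each token carries the index label of the row it
# came from; previously the result had a fresh RangeIndex (0, 1, 2):
# 10      hello
# 10      world
# 20    goodbye
# dtype: object
```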

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #12814
shwina authored Mar 9, 2023
1 parent f5bb7b1 commit 02d3751
Showing 2 changed files with 91 additions and 58 deletions.
python/cudf/cudf/core/column/string.py (59 additions, 50 deletions)

@@ -4564,22 +4564,22 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
         >>> ser = cudf.Series(data)
         >>> ser.str.tokenize()
         0      hello
+        0      world
+        1    goodbye
         1      world
+        2      hello
         2    goodbye
-        3      world
-        4      hello
-        5    goodbye
         dtype: object
         """
         delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)

         if isinstance(delimiter, Column):
-            return self._return_or_inplace(
+            result = self._return_or_inplace(
                 libstrings._tokenize_column(self._column, delimiter),
                 retain_index=False,
             )
         elif isinstance(delimiter, cudf.Scalar):
-            return self._return_or_inplace(
+            result = self._return_or_inplace(
                 libstrings._tokenize_scalar(self._column, delimiter),
                 retain_index=False,
             )
@@ -4588,6 +4588,11 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
                 f"Expected a Scalar or Column\
                 for delimiters, but got {type(delimiter)}"
             )
+        if isinstance(self._parent, cudf.Series):
+            result.index = self._parent.index.repeat(  # type: ignore
+                self.token_count()
+            )
+        return result

     def detokenize(
         self, indices: "cudf.Series", separator: str = " "
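
The repeat-based reindexing in the tokenize hunk above can be illustrated standalone (a sketch; `Series.str.token_count` and `Index.repeat` are the same public cudf calls the patch relies on):

```python
import cudf

ser = cudf.Series(["a b c", "d"], index=["x", "y"])

# One count per row: "a b c" has 3 tokens, "d" has 1.
counts = ser.str.token_count()

# Repeating each label by its row's token count yields the result
# index for tokenize(): the labels x, x, x, y.
print(ser.index.repeat(counts))
```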
@@ -4641,41 +4646,43 @@ def character_tokenize(self) -> SeriesOrIndex:
         >>> data = ["hello world", None, "goodbye, thank you."]
         >>> ser = cudf.Series(data)
         >>> ser.str.character_tokenize()
-        0     h
-        1     e
-        2     l
-        3     l
-        4     o
-        5
-        6     w
-        7     o
-        8     r
-        9     l
-        10    d
-        11    g
-        12    o
-        13    o
-        14    d
-        15    b
-        16    y
-        17    e
-        18    ,
-        19
-        20    t
-        21    h
-        22    a
-        23    n
-        24    k
-        25
-        26    y
-        27    o
-        28    u
-        29    .
+        0    h
+        0    e
+        0    l
+        0    l
+        0    o
+        0
+        0    w
+        0    o
+        0    r
+        0    l
+        0    d
+        2    g
+        2    o
+        2    o
+        2    d
+        2    b
+        2    y
+        2    e
+        2    ,
+        2
+        2    t
+        2    h
+        2    a
+        2    n
+        2    k
+        2
+        2    y
+        2    o
+        2    u
+        2    .
         dtype: object
         """
         result_col = libstrings.character_tokenize(self._column)
         if isinstance(self._parent, cudf.Series):
-            return cudf.Series(result_col, name=self._parent.name)
+            lengths = self.len().fillna(0)
+            index = self._parent.index.repeat(lengths)
+            return cudf.Series(result_col, name=self._parent.name, index=index)
         elif isinstance(self._parent, cudf.BaseIndex):
             return cudf.core.index.as_index(result_col, name=self._parent.name)
         else:
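
A note on the `fillna(0)` above (a sketch of the reasoning): `str.len()` returns `<NA>` for null rows, which `repeat` cannot consume, so null rows are mapped to zero repeats and simply drop out of the character-tokenized result, as the docstring example shows for the `None` row:

```python
import cudf

ser = cudf.Series(["hi", None, "yo"])

# 0    2
# 1    0   <- null row contributes no characters
# 2    2
lengths = ser.str.len().fillna(0)

# The label of the null row (1) is absent from the repeated index.
print(ser.index.repeat(lengths))
```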
@@ -4780,20 +4787,20 @@ def character_ngrams(
         >>> str_series = cudf.Series(['abcd','efgh','xyz'])
         >>> str_series.str.character_ngrams(2)
         0    ab
-        1    bc
-        2    cd
-        3    ef
-        4    fg
-        5    gh
-        6    xy
-        7    yz
+        0    bc
+        0    cd
+        1    ef
+        1    fg
+        1    gh
+        2    xy
+        2    yz
         dtype: object
         >>> str_series.str.character_ngrams(3)
         0    abc
-        1    bcd
-        2    efg
-        3    fgh
-        4    xyz
+        0    bcd
+        1    efg
+        1    fgh
+        2    xyz
         dtype: object
         >>> str_series.str.character_ngrams(3,True)
         0    [abc, bcd]
@@ -4802,8 +4809,6 @@ def character_ngrams(
         dtype: list
         """
         ngrams = libstrings.generate_character_ngrams(self._column, n)
-        if as_list is False:
-            return self._return_or_inplace(ngrams, retain_index=False)

         # convert the output to a list by just generating the
         # offsets for the output list column
@@ -4820,7 +4825,11 @@ def character_ngrams(
             null_count=self._column.null_count,
             children=(oc, ngrams),
         )
-        return self._return_or_inplace(lc, retain_index=False)
+        result = self._return_or_inplace(lc, retain_index=True)
+
+        if isinstance(result, cudf.Series) and not as_list:
+            return result.explode()
+        return result

     def ngrams_tokenize(
         self, n: int = 2, delimiter: str = " ", separator: str = "_"
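
The `explode` call in the hunk above is what keeps the index aligned for the non-list case: the list column is built with `retain_index=True`, so each input row owns one list, and `Series.explode` repeats each label once per list element. A standalone sketch:

```python
import cudf

# A list column whose index labels are 10 and 20; exploding it repeats
# each label once per list element, so no manual index bookkeeping is
# needed.
lists = cudf.Series([["ab", "bc"], ["xy"]], index=[10, 20])
print(lists.explode())
# 10    ab
# 10    bc
# 20    xy
# dtype: object
```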
python/cudf/cudf/tests/test_text.py (32 additions, 8 deletions)
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.

from io import StringIO

@@ -24,7 +24,7 @@ def test_tokenize():
         ]
     )

-    expected = cudf.Series(
+    expected_values = cudf.Series(
         [
             "the",
             "quick",

@@ -43,6 +43,8 @@ def test_tokenize():
             "sofa",
         ]
     )
+    expected_index = strings.index.repeat(strings.str.token_count())
+    expected = cudf.Series(expected_values, index=expected_index)

     actual = strings.str.tokenize()
@@ -231,7 +233,7 @@ def test_ngrams(n, separator, expected_values):


 @pytest.mark.parametrize(
-    "n, expected_values, as_list",
+    "n, expected_values, expected_index, as_list",
     [
         (
             2,
@@ -247,21 +249,41 @@ def test_ngrams(n, separator, expected_values):
                 "he",
                 "er",
                 "re",
+                cudf.NA,
             ],
+            [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6],
             False,
         ),
-        (3, ["thi", "his", "boo", "ook", "her", "ere"], False),
+        (
+            3,
+            [
+                "thi",
+                "his",
+                cudf.NA,
+                cudf.NA,
+                "boo",
+                "ook",
+                "her",
+                "ere",
+                cudf.NA,
+            ],
+            [1, 1, 2, 3, 4, 4, 5, 5, 6],
+            False,
+        ),
         (
             3,
             [["thi", "his"], [], [], ["boo", "ook"], ["her", "ere"], []],
+            [1, 2, 3, 4, 5, 6],
             True,
         ),
     ],
 )
-def test_character_ngrams(n, expected_values, as_list):
-    strings = cudf.Series(["this", "is", "my", "book", "here", ""])
+def test_character_ngrams(n, expected_values, expected_index, as_list):
+    strings = cudf.Series(
+        ["this", "is", "my", "book", "here", ""], index=[1, 2, 3, 4, 5, 6]
+    )

-    expected = cudf.Series(expected_values)
+    expected = cudf.Series(expected_values, index=expected_index)

     actual = strings.str.character_ngrams(n=n, as_list=as_list)
@@ -314,7 +336,7 @@ def test_character_tokenize_series():
             ),
         ]
     )
-    expected = cudf.Series(
+    expected_values = cudf.Series(
         [
             "h",
             "e",
@@ -402,6 +424,8 @@ def test_character_tokenize_series():
             "DŽ",
         ]
     )
+    expected_index = sr.index.repeat(sr.str.len().fillna(0))
+    expected = cudf.Series(expected_values, index=expected_index)

     actual = sr.str.character_tokenize()
     assert_eq(expected, actual)