Make string methods return a Series with a useful Index (#12814)
Closes #12806

Many string methods like `character_ngrams` currently return a `Series` with the default index (`RangeIndex`). This PR makes the index of the result correspond to the index of the input.

More specifically, this PR changes the index of the result of the following string methods (a short usage sketch follows the list):

- [x] `character_ngrams`
- [x] `tokenize`
- [x] `character_tokenize`
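
A minimal sketch of the new behavior (not part of the diff below; it assumes a cudf build that includes this change):

```python
import cudf

ser = cudf.Series(["hello world", "goodbye"], index=[10, 20])
print(ser.str.tokenize())
# With this change, each token carries the index label of the row it
# came from; previously the result had a fresh RangeIndex (0, 1, 2):
# 10      hello
# 10      world
# 20    goodbye
# dtype: object
```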

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #12814
shwina authored Mar 9, 2023
1 parent f5bb7b1 commit 02d3751
Showing 2 changed files with 91 additions and 58 deletions.
python/cudf/cudf/core/column/string.py (59 additions, 50 deletions)

@@ -4564,22 +4564,22 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
         >>> ser = cudf.Series(data)
         >>> ser.str.tokenize()
         0      hello
+        0      world
+        1    goodbye
         1      world
+        2      hello
         2    goodbye
-        3      world
-        4      hello
-        5    goodbye
         dtype: object
         """
         delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)

         if isinstance(delimiter, Column):
-            return self._return_or_inplace(
+            result = self._return_or_inplace(
                 libstrings._tokenize_column(self._column, delimiter),
                 retain_index=False,
             )
         elif isinstance(delimiter, cudf.Scalar):
-            return self._return_or_inplace(
+            result = self._return_or_inplace(
                 libstrings._tokenize_scalar(self._column, delimiter),
                 retain_index=False,
             )
@@ -4588,6 +4588,11 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
                 f"Expected a Scalar or Column\
                 for delimiters, but got {type(delimiter)}"
             )
+        if isinstance(self._parent, cudf.Series):
+            result.index = self._parent.index.repeat(  # type: ignore
+                self.token_count()
+            )
+        return result

     def detokenize(
         self, indices: "cudf.Series", separator: str = " "
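
The repeat-based reindexing in the tokenize hunk above can be illustrated standalone (a sketch; `Series.str.token_count` and `Index.repeat` are the same public cudf calls the patch relies on):

```python
import cudf

ser = cudf.Series(["a b c", "d"], index=["x", "y"])

# One count per row: "a b c" has 3 tokens, "d" has 1.
counts = ser.str.token_count()

# Repeating each label by its row's token count yields the result
# index for tokenize(): the labels x, x, x, y.
print(ser.index.repeat(counts))
```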
@@ -4641,41 +4646,43 @@ def character_tokenize(self) -> SeriesOrIndex:
         >>> data = ["hello world", None, "goodbye, thank you."]
         >>> ser = cudf.Series(data)
         >>> ser.str.character_tokenize()
-        0     h
-        1     e
-        2     l
-        3     l
-        4     o
-        5
-        6     w
-        7     o
-        8     r
-        9     l
-        10    d
-        11    g
-        12    o
-        13    o
-        14    d
-        15    b
-        16    y
-        17    e
-        18    ,
-        19
-        20    t
-        21    h
-        22    a
-        23    n
-        24    k
-        25
-        26    y
-        27    o
-        28    u
-        29    .
+        0    h
+        0    e
+        0    l
+        0    l
+        0    o
+        0
+        0    w
+        0    o
+        0    r
+        0    l
+        0    d
+        2    g
+        2    o
+        2    o
+        2    d
+        2    b
+        2    y
+        2    e
+        2    ,
+        2
+        2    t
+        2    h
+        2    a
+        2    n
+        2    k
+        2
+        2    y
+        2    o
+        2    u
+        2    .
         dtype: object
         """
         result_col = libstrings.character_tokenize(self._column)
         if isinstance(self._parent, cudf.Series):
-            return cudf.Series(result_col, name=self._parent.name)
+            lengths = self.len().fillna(0)
+            index = self._parent.index.repeat(lengths)
+            return cudf.Series(result_col, name=self._parent.name, index=index)
         elif isinstance(self._parent, cudf.BaseIndex):
             return cudf.core.index.as_index(result_col, name=self._parent.name)
         else:
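
A note on the `fillna(0)` above (a sketch of the reasoning): `str.len()` returns `<NA>` for null rows, which `repeat` cannot consume, so null rows are mapped to zero repeats and simply drop out of the character-tokenized result, as the docstring example shows for the `None` row:

```python
import cudf

ser = cudf.Series(["hi", None, "yo"])

# 0    2
# 1    0   <- null row contributes no characters
# 2    2
lengths = ser.str.len().fillna(0)

# The label of the null row (1) is absent from the repeated index.
print(ser.index.repeat(lengths))
```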
@@ -4780,20 +4787,20 @@ def character_ngrams(
         >>> str_series = cudf.Series(['abcd','efgh','xyz'])
         >>> str_series.str.character_ngrams(2)
         0    ab
-        1    bc
-        2    cd
-        3    ef
-        4    fg
-        5    gh
-        6    xy
-        7    yz
+        0    bc
+        0    cd
+        1    ef
+        1    fg
+        1    gh
+        2    xy
+        2    yz
         dtype: object
         >>> str_series.str.character_ngrams(3)
         0    abc
-        1    bcd
-        2    efg
-        3    fgh
-        4    xyz
+        0    bcd
+        1    efg
+        1    fgh
+        2    xyz
         dtype: object
         >>> str_series.str.character_ngrams(3,True)
         0    [abc, bcd]
@@ -4802,8 +4809,6 @@ def character_ngrams(
         dtype: list
         """
         ngrams = libstrings.generate_character_ngrams(self._column, n)
-        if as_list is False:
-            return self._return_or_inplace(ngrams, retain_index=False)

         # convert the output to a list by just generating the
         # offsets for the output list column
@@ -4820,7 +4825,11 @@ def character_ngrams(
             null_count=self._column.null_count,
             children=(oc, ngrams),
         )
-        return self._return_or_inplace(lc, retain_index=False)
+        result = self._return_or_inplace(lc, retain_index=True)
+
+        if isinstance(result, cudf.Series) and not as_list:
+            return result.explode()
+        return result

     def ngrams_tokenize(
         self, n: int = 2, delimiter: str = " ", separator: str = "_"
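
The `explode` call in the hunk above is what keeps the index aligned for the non-list case: the list column is built with `retain_index=True`, so each input row owns one list, and `Series.explode` repeats each label once per list element. A standalone sketch:

```python
import cudf

# A list column whose index labels are 10 and 20; exploding it repeats
# each label once per list element, so no manual index bookkeeping is
# needed.
lists = cudf.Series([["ab", "bc"], ["xy"]], index=[10, 20])
print(lists.explode())
# 10    ab
# 10    bc
# 20    xy
# dtype: object
```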
python/cudf/cudf/tests/test_text.py (32 additions, 8 deletions)
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.

from io import StringIO

@@ -24,7 +24,7 @@ def test_tokenize():
         ]
     )

-    expected = cudf.Series(
+    expected_values = cudf.Series(
         [
             "the",
             "quick",

@@ -43,6 +43,8 @@ def test_tokenize():
             "sofa",
         ]
     )
+    expected_index = strings.index.repeat(strings.str.token_count())
+    expected = cudf.Series(expected_values, index=expected_index)

     actual = strings.str.tokenize()
@@ -231,7 +233,7 @@ def test_ngrams(n, separator, expected_values):


 @pytest.mark.parametrize(
-    "n, expected_values, as_list",
+    "n, expected_values, expected_index, as_list",
     [
         (
             2,
@@ -247,21 +249,41 @@ def test_ngrams(n, separator, expected_values):
                 "he",
                 "er",
                 "re",
+                cudf.NA,
             ],
+            [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6],
             False,
         ),
-        (3, ["thi", "his", "boo", "ook", "her", "ere"], False),
+        (
+            3,
+            [
+                "thi",
+                "his",
+                cudf.NA,
+                cudf.NA,
+                "boo",
+                "ook",
+                "her",
+                "ere",
+                cudf.NA,
+            ],
+            [1, 1, 2, 3, 4, 4, 5, 5, 6],
+            False,
+        ),
         (
             3,
             [["thi", "his"], [], [], ["boo", "ook"], ["her", "ere"], []],
+            [1, 2, 3, 4, 5, 6],
             True,
         ),
     ],
 )
-def test_character_ngrams(n, expected_values, as_list):
-    strings = cudf.Series(["this", "is", "my", "book", "here", ""])
+def test_character_ngrams(n, expected_values, expected_index, as_list):
+    strings = cudf.Series(
+        ["this", "is", "my", "book", "here", ""], index=[1, 2, 3, 4, 5, 6]
+    )

-    expected = cudf.Series(expected_values)
+    expected = cudf.Series(expected_values, index=expected_index)

     actual = strings.str.character_ngrams(n=n, as_list=as_list)
@@ -314,7 +336,7 @@ def test_character_tokenize_series():
             ),
         ]
     )
-    expected = cudf.Series(
+    expected_values = cudf.Series(
         [
             "h",
             "e",
@@ -402,6 +424,8 @@ def test_character_tokenize_series():
             "DŽ",
         ]
     )
+    expected_index = sr.index.repeat(sr.str.len().fillna(0))
+    expected = cudf.Series(expected_values, index=expected_index)

     actual = sr.str.character_tokenize()
     assert_eq(expected, actual)