Skip to content

Commit

Permalink
Fix tokenize with non-space delimiter (#13403)
Browse files Browse the repository at this point in the history
Closes #13399

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #13403
  • Loading branch information
shwina authored May 23, 2023
1 parent 12acf92 commit 7660af0
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 1 deletion.
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -4609,7 +4609,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
)
if isinstance(self._parent, cudf.Series):
result.index = self._parent.index.repeat( # type: ignore
self.token_count()
self.token_count(delimiter=delimiter)
)
return result

Expand Down Expand Up @@ -5296,6 +5296,9 @@ def minhash(


def _massage_string_arg(value, name, allow_col=False):
if isinstance(value, cudf.Scalar):
return value

if isinstance(value, str):
return cudf.Scalar(value, dtype="str")

Expand Down
29 changes: 29 additions & 0 deletions python/cudf/cudf/tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,35 @@ def test_tokenize():
assert_eq(expected, actual)


def test_tokenize_delimiter():
    """Tokenize with a non-space delimiter.

    Regression test for GH #13399: the repeated index must be built from
    token counts computed with the *same* delimiter passed to ``tokenize``,
    not the default whitespace delimiter.
    """
    strings = cudf.Series(
        [
            "the quick fox jumped over the lazy dog",
            "the siamésé cat jumped under the sofa",
            None,
            "",
        ]
    )

    # Expected tokens when splitting on "o"; the null and empty rows yield
    # zero tokens and therefore contribute no rows to the result.
    expected_values = cudf.Series(
        [
            "the quick f",
            "x jumped ",
            "ver the lazy d",
            "g",
            "the siamésé cat jumped under the s",
            "fa",
        ]
    )
    # Each input row's index label is repeated once per token it produces.
    expected_index = strings.index.repeat(strings.str.token_count("o"))
    expected = cudf.Series(expected_values, index=expected_index)

    actual = strings.str.tokenize(delimiter="o")

    # Exact-type comparison: type objects are singletons, so identity
    # (`is`) is the correct check rather than `==`.
    assert type(expected) is type(actual)
    assert_eq(expected, actual)


def test_detokenize():
strings = cudf.Series(
[
Expand Down

0 comments on commit 7660af0

Please sign in to comment.