diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index d5ef5fb5d11..d9a6c6c4cd6 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4564,22 +4564,22 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
         >>> ser = cudf.Series(data)
         >>> ser.str.tokenize()
         0      hello
+        0      world
+        1    goodbye
         1      world
+        2      hello
         2    goodbye
-        3      world
-        4      hello
-        5    goodbye
         dtype: object
         """
         delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
 
         if isinstance(delimiter, Column):
-            return self._return_or_inplace(
+            result = self._return_or_inplace(
                 libstrings._tokenize_column(self._column, delimiter),
                 retain_index=False,
             )
         elif isinstance(delimiter, cudf.Scalar):
-            return self._return_or_inplace(
+            result = self._return_or_inplace(
                 libstrings._tokenize_scalar(self._column, delimiter),
                 retain_index=False,
             )
@@ -4588,6 +4588,11 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
                 f"Expected a Scalar or Column\
                 for delimiters, but got {type(delimiter)}"
             )
+        if isinstance(self._parent, cudf.Series):
+            result.index = self._parent.index.repeat(  # type: ignore
+                self.token_count()
+            )
+        return result
 
     def detokenize(
         self, indices: "cudf.Series", separator: str = " "
@@ -4641,41 +4646,43 @@ def character_tokenize(self) -> SeriesOrIndex:
         >>> data = ["hello world", None, "goodbye, thank you."]
         >>> ser = cudf.Series(data)
         >>> ser.str.character_tokenize()
-        0     h
-        1     e
-        2     l
-        3     l
-        4     o
-        5
-        6     w
-        7     o
-        8     r
-        9     l
-        10    d
-        11    g
-        12    o
-        13    o
-        14    d
-        15    b
-        16    y
-        17    e
-        18    ,
-        19
-        20    t
-        21    h
-        22    a
-        23    n
-        24    k
-        25
-        26    y
-        27    o
-        28    u
-        29    .
+        0    h
+        0    e
+        0    l
+        0    l
+        0    o
+        0
+        0    w
+        0    o
+        0    r
+        0    l
+        0    d
+        2    g
+        2    o
+        2    o
+        2    d
+        2    b
+        2    y
+        2    e
+        2    ,
+        2
+        2    t
+        2    h
+        2    a
+        2    n
+        2    k
+        2
+        2    y
+        2    o
+        2    u
+        2    .
         dtype: object
         """
         result_col = libstrings.character_tokenize(self._column)
         if isinstance(self._parent, cudf.Series):
-            return cudf.Series(result_col, name=self._parent.name)
+            lengths = self.len().fillna(0)
+            index = self._parent.index.repeat(lengths)
+            return cudf.Series(result_col, name=self._parent.name, index=index)
         elif isinstance(self._parent, cudf.BaseIndex):
             return cudf.core.index.as_index(result_col, name=self._parent.name)
         else:
@@ -4780,20 +4787,20 @@ def character_ngrams(
         >>> str_series = cudf.Series(['abcd','efgh','xyz'])
         >>> str_series.str.character_ngrams(2)
         0    ab
-        1    bc
-        2    cd
-        3    ef
-        4    fg
-        5    gh
-        6    xy
-        7    yz
+        0    bc
+        0    cd
+        1    ef
+        1    fg
+        1    gh
+        2    xy
+        2    yz
         dtype: object
         >>> str_series.str.character_ngrams(3)
         0    abc
-        1    bcd
-        2    efg
-        3    fgh
-        4    xyz
+        0    bcd
+        1    efg
+        1    fgh
+        2    xyz
         dtype: object
         >>> str_series.str.character_ngrams(3,True)
         0    [abc, bcd]
@@ -4802,8 +4809,6 @@ def character_ngrams(
         dtype: list
         """
         ngrams = libstrings.generate_character_ngrams(self._column, n)
-        if as_list is False:
-            return self._return_or_inplace(ngrams, retain_index=False)
 
         # convert the output to a list by just generating the
         # offsets for the output list column
@@ -4820,7 +4825,11 @@ def character_ngrams(
             null_count=self._column.null_count,
             children=(oc, ngrams),
         )
-        return self._return_or_inplace(lc, retain_index=False)
+        result = self._return_or_inplace(lc, retain_index=True)
+
+        if isinstance(result, cudf.Series) and not as_list:
+            return result.explode()
+        return result
 
     def ngrams_tokenize(
         self, n: int = 2, delimiter: str = " ", separator: str = "_"
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index 627bf0a68bb..89c428551e4 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 
 from io import StringIO
 
@@ -24,7 +24,7 @@ def test_tokenize():
         ]
     )
 
-    expected = cudf.Series(
+    expected_values = cudf.Series(
         [
             "the",
             "quick",
@@ -43,6 +43,8 @@ def test_tokenize():
             "sofa",
         ]
     )
+    expected_index = strings.index.repeat(strings.str.token_count())
+    expected = cudf.Series(expected_values, index=expected_index)
 
     actual = strings.str.tokenize()
 
@@ -231,7 +233,7 @@ def test_ngrams(n, separator, expected_values):
 
 
 @pytest.mark.parametrize(
-    "n, expected_values, as_list",
+    "n, expected_values, expected_index, as_list",
    [
        (
            2,
@@ -247,21 +249,41 @@ def test_ngrams(n, separator, expected_values):
                "he",
                "er",
                "re",
+                cudf.NA,
            ],
+            [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6],
+            False,
+        ),
+        (
+            3,
+            [
+                "thi",
+                "his",
+                cudf.NA,
+                cudf.NA,
+                "boo",
+                "ook",
+                "her",
+                "ere",
+                cudf.NA,
+            ],
+            [1, 1, 2, 3, 4, 4, 5, 5, 6],
            False,
        ),
-        (3, ["thi", "his", "boo", "ook", "her", "ere"], False),
        (
            3,
            [["thi", "his"], [], [], ["boo", "ook"], ["her", "ere"], []],
+            [1, 2, 3, 4, 5, 6],
            True,
        ),
    ],
 )
-def test_character_ngrams(n, expected_values, as_list):
-    strings = cudf.Series(["this", "is", "my", "book", "here", ""])
+def test_character_ngrams(n, expected_values, expected_index, as_list):
+    strings = cudf.Series(
+        ["this", "is", "my", "book", "here", ""], index=[1, 2, 3, 4, 5, 6]
+    )
 
-    expected = cudf.Series(expected_values)
+    expected = cudf.Series(expected_values, index=expected_index)
 
     actual = strings.str.character_ngrams(n=n, as_list=as_list)
 
@@ -314,7 +336,7 @@ def test_character_tokenize_series():
            ),
        ]
    )
-    expected = cudf.Series(
+    expected_values = cudf.Series(
        [
            "h",
            "e",
@@ -402,6 +424,8 @@ def test_character_tokenize_series():
            "DŽ",
        ]
    )
+    expected_index = sr.index.repeat(sr.str.len().fillna(0))
+    expected = cudf.Series(expected_values, index=expected_index)
 
     actual = sr.str.character_tokenize()
     assert_eq(expected, actual)
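
Reviewer note: a minimal sketch of the behavior this patch introduces, using made-up data and index labels, and assuming a cudf build with the patch applied. The token-producing APIs now repeat each row's index label once per emitted token instead of returning a fresh RangeIndex, so every token can be traced back to its source row:

    >>> import cudf
    >>> ser = cudf.Series(["hello world", "goodbye"], index=[10, 20])
    >>> ser.str.tokenize()
    10      hello
    10      world
    20    goodbye
    dtype: object
    >>> # the output length still matches token_count(), so the repeated
    >>> # index stays aligned with the token values
    >>> int(ser.str.token_count().sum())
    3

Rows that produce no tokens contribute zero repeats, so their labels simply drop out of the result (hence the skipped label 1 for the null row in the updated character_tokenize docstring). For character_ngrams with as_list=False the same effect falls out of building the list column with retain_index=True and calling Series.explode(), which repeats the index label per list element and emits <NA> for empty lists; that is why the updated test expectations gain cudf.NA entries for strings shorter than n.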