From 50f1bf5fa2d3c86b305f4241ddb1814ddbce181c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 21 Feb 2023 12:54:36 -0500 Subject: [PATCH 1/4] Make character_ngrams return the right index --- python/cudf/cudf/core/column/string.py | 8 ++++--- python/cudf/cudf/tests/test_text.py | 32 +++++++++++++++++++++----- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 8d6ffe48957..112aba0f8a5 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4813,8 +4813,6 @@ def character_ngrams( dtype: list """ ngrams = libstrings.generate_character_ngrams(self._column, n) - if as_list is False: - return self._return_or_inplace(ngrams, retain_index=False) # convert the output to a list by just generating the # offsets for the output list column @@ -4831,7 +4829,11 @@ def character_ngrams( null_count=self._column.null_count, children=(oc, ngrams), ) - return self._return_or_inplace(lc, retain_index=False) + result = self._return_or_inplace(lc, retain_index=True) + + if isinstance(result, cudf.Series) and not as_list: + return result.explode() + return result def ngrams_tokenize( self, n: int = 2, delimiter: str = " ", separator: str = "_" diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 627bf0a68bb..2fc7d0c6c1e 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. from io import StringIO @@ -231,7 +231,7 @@ def test_ngrams(n, separator, expected_values): @pytest.mark.parametrize( - "n, expected_values, as_list", + "n, expected_values, expected_index, as_list", [ ( 2, @@ -247,21 +247,41 @@ def test_ngrams(n, separator, expected_values): "he", "er", "re", + cudf.NA, ], + [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6], + False, + ), + ( + 3, + [ + "thi", + "his", + cudf.NA, + cudf.NA, + "boo", + "ook", + "her", + "ere", + cudf.NA, + ], + [1, 1, 2, 3, 4, 4, 5, 5, 6], False, ), - (3, ["thi", "his", "boo", "ook", "her", "ere"], False), ( 3, [["thi", "his"], [], [], ["boo", "ook"], ["her", "ere"], []], + [1, 2, 3, 4, 5, 6], True, ), ], ) -def test_character_ngrams(n, expected_values, as_list): - strings = cudf.Series(["this", "is", "my", "book", "here", ""]) +def test_character_ngrams(n, expected_values, expected_index, as_list): + strings = cudf.Series( + ["this", "is", "my", "book", "here", ""], index=[1, 2, 3, 4, 5, 6] + ) - expected = cudf.Series(expected_values) + expected = cudf.Series(expected_values, index=expected_index) actual = strings.str.character_ngrams(n=n, as_list=as_list) From b8e7cd53750bdda05a53cb63fe4350aa27c5bbaf Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 7 Mar 2023 10:38:26 -0500 Subject: [PATCH 2/4] Fix index for character_tokenize --- python/cudf/cudf/core/column/string.py | 4 +++- python/cudf/cudf/tests/test_text.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4e2b6989370..c11be242218 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4675,7 +4675,9 @@ def character_tokenize(self) -> SeriesOrIndex: """ result_col = libstrings.character_tokenize(self._column) if isinstance(self._parent, cudf.Series): - return cudf.Series(result_col, name=self._parent.name) + lengths = self.len().fillna(0) + index = self._parent.index.repeat(lengths) + return cudf.Series(result_col, name=self._parent.name, index=index) elif isinstance(self._parent, cudf.BaseIndex): return cudf.core.index.as_index(result_col, name=self._parent.name) else: diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 2fc7d0c6c1e..f0d5684280a 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -334,7 +334,7 @@ def test_character_tokenize_series(): ), ] ) - expected = cudf.Series( + expected_values = cudf.Series( [ "h", "e", @@ -422,6 +422,8 @@ def test_character_tokenize_series(): "DŽ", ] ) + expected_index = sr.index.repeat(sr.str.len().fillna(0)) + expected = cudf.Series(expected_values, index=expected_index) actual = sr.str.character_tokenize() assert_eq(expected, actual) From 55578e124ca4b21e071aa76af9d2e72ef1dbe666 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 7 Mar 2023 10:52:58 -0500 Subject: [PATCH 3/4] tokenize --- python/cudf/cudf/core/column/string.py | 9 +++++++-- python/cudf/cudf/tests/test_text.py | 4 +++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c11be242218..b58e47246c6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4574,12 +4574,12 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) if isinstance(delimiter, Column): - return self._return_or_inplace( + result = self._return_or_inplace( libstrings._tokenize_column(self._column, delimiter), retain_index=False, ) elif isinstance(delimiter, cudf.Scalar): - return self._return_or_inplace( + result = self._return_or_inplace( libstrings._tokenize_scalar(self._column, delimiter), retain_index=False, ) @@ -4588,6 +4588,11 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: f"Expected a Scalar or Column\ for delimiters, but got {type(delimiter)}" ) + if isinstance(self._parent, cudf.Series): + result.index = self._parent.index.repeat( # type: ignore + self.token_count() + ) + return result def detokenize( self, indices: "cudf.Series", separator: str = " " diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index f0d5684280a..89c428551e4 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -24,7 +24,7 @@ def test_tokenize(): ] ) - expected = cudf.Series( + expected_values = cudf.Series( [ "the", "quick", @@ -43,6 +43,8 @@ def test_tokenize(): "sofa", ] ) + expected_index = strings.index.repeat(strings.str.token_count()) + expected = cudf.Series(expected_values, index=expected_index) actual = strings.str.tokenize() From b0ec6de2dd311b2efdf83b1358fcf4ad7b2bfca3 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 7 Mar 2023 10:57:08 -0500 Subject: [PATCH 4/4] docs --- python/cudf/cudf/core/column/string.py | 88 +++++++++++++------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index b58e47246c6..d9a6c6c4cd6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4564,11 +4564,11 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: >>> ser = cudf.Series(data) >>> ser.str.tokenize() 0 hello + 0 world + 1 goodbye 1 world + 2 hello 2 goodbye - 3 world - 4 hello - 5 goodbye dtype: object """ delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) @@ -4646,36 +4646,36 @@ def character_tokenize(self) -> SeriesOrIndex: >>> data = ["hello world", None, "goodbye, thank you."] >>> ser = cudf.Series(data) >>> ser.str.character_tokenize() - 0 h - 1 e - 2 l - 3 l - 4 o - 5 - 6 w - 7 o - 8 r - 9 l - 10 d - 11 g - 12 o - 13 o - 14 d - 15 b - 16 y - 17 e - 18 , - 19 - 20 t - 21 h - 22 a - 23 n - 24 k - 25 - 26 y - 27 o - 28 u - 29 . + 0 h + 0 e + 0 l + 0 l + 0 o + 0 + 0 w + 0 o + 0 r + 0 l + 0 d + 2 g + 2 o + 2 o + 2 d + 2 b + 2 y + 2 e + 2 , + 2 + 2 t + 2 h + 2 a + 2 n + 2 k + 2 + 2 y + 2 o + 2 u + 2 . dtype: object """ result_col = libstrings.character_tokenize(self._column) @@ -4787,20 +4787,20 @@ def character_ngrams( >>> str_series = cudf.Series(['abcd','efgh','xyz']) >>> str_series.str.character_ngrams(2) 0 ab - 1 bc - 2 cd - 3 ef - 4 fg - 5 gh - 6 xy - 7 yz + 0 bc + 0 cd + 1 ef + 1 fg + 1 gh + 2 xy + 2 yz dtype: object >>> str_series.str.character_ngrams(3) 0 abc - 1 bcd - 2 efg - 3 fgh - 4 xyz + 0 bcd + 1 efg + 1 fgh + 2 xyz dtype: object >>> str_series.str.character_ngrams(3,True) 0 [abc, bcd]