From a926c52d58c08657f8d437210ce31fddeaa868e7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 17 Oct 2022 21:12:11 -0500 Subject: [PATCH] Add `.str.find_multiple` API (#11928) Resolves: https://github.com/rapidsai/cudf/issues/10126 This PR adds `.str.find_multiple` API. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11928 --- docs/cudf/source/api_docs/string_handling.rst | 1 + python/cudf/cudf/_lib/strings/__init__.py | 1 + python/cudf/cudf/core/column/string.py | 64 +++++++++++++++++++ python/cudf/cudf/tests/test_string.py | 61 ++++++++++++++++++ 4 files changed, 127 insertions(+) diff --git a/docs/cudf/source/api_docs/string_handling.rst b/docs/cudf/source/api_docs/string_handling.rst index 1496d68db6f..2285bb8fb7a 100644 --- a/docs/cudf/source/api_docs/string_handling.rst +++ b/docs/cudf/source/api_docs/string_handling.rst @@ -28,6 +28,7 @@ strings and apply several methods to it. 
These can be accessed like
    filter_tokens
    find
    findall
+   find_multiple
    get
    get_json_object
    hex_to_int
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index ff558a06d87..22a5066a20e 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -61,6 +61,7 @@
     startswith,
     startswith_multiple,
 )
+from cudf._lib.strings.find_multiple import find_multiple
 from cudf._lib.strings.findall import findall
 from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object
 from cudf._lib.strings.padding import (
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 287e68531f8..c84e4ff4adb 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -3623,6 +3623,70 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex:
         data = libstrings.findall(self._column, pat, flags)
         return self._return_or_inplace(data)
 
+    def find_multiple(self, patterns: SeriesOrIndex) -> "cudf.Series":
+        """
+        Find the first occurrence of each pattern in each string.
+
+        Parameters
+        ----------
+        patterns : array-like, Sequence, Series or Index
+            String patterns to search for in the given Series/Index.
+
+        Returns
+        -------
+        Series
+            One list per string, holding the position of each pattern's first
+            occurrence in that string, or -1 where the pattern is not found.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series(["strings", "to", "search", "in"])
+        >>> s
+        0    strings
+        1         to
+        2     search
+        3         in
+        dtype: object
+        >>> t = cudf.Series(["a", "string", "g", "inn", "o", "r", "sea"])
+        >>> t
+        0         a
+        1    string
+        2         g
+        3       inn
+        4         o
+        5         r
+        6       sea
+        dtype: object
+        >>> s.str.find_multiple(t)
+        0       [-1, 0, 5, -1, -1, 2, -1]
+        1     [-1, -1, -1, -1, 1, -1, -1]
+        2       [2, -1, -1, -1, -1, 3, 0]
+        3    [-1, -1, -1, -1, -1, -1, -1]
+        dtype: list
+        """
+        if can_convert_to_column(patterns):
+            patterns_column = column.as_column(patterns)
+        else:
+            raise TypeError(
+                "patterns should be an array-like or a Series object, "
+                f"found {type(patterns)}"
+            )
+
+        if not isinstance(patterns_column, StringColumn):
+            raise TypeError(
+                "patterns can only be of 'string' dtype, "
+                f"got: {patterns_column.dtype}"
+            )
+
+        return cudf.Series(
+            libstrings.find_multiple(self._column, patterns_column),
+            index=self._parent.index
+            if isinstance(self._parent, cudf.Series)
+            else self._parent,
+            name=self._parent.name,
+        )
+
     def isempty(self) -> SeriesOrIndex:
         """
         Check whether each string is an empty string.
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 74d602c2cf1..2a43adf5a5c 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -3423,3 +3423,64 @@ def test_str_join_lists(sr, sep, string_na_rep, sep_na_rep, expected):
         sep=sep, string_na_rep=string_na_rep, sep_na_rep=sep_na_rep
     )
     assert_eq(actual, expected)
+
+
+@pytest.mark.parametrize(
+    "patterns, expected",
+    [
+        (
+            lambda: ["a", "s", "g", "i", "o", "r"],
+            [
+                [-1, 0, 5, 3, -1, 2],
+                [-1, -1, -1, -1, 1, -1],
+                [2, 0, -1, -1, -1, 3],
+                [-1, -1, -1, 0, -1, -1],
+            ],
+        ),
+        (
+            lambda: cudf.Series(["a", "string", "g", "inn", "o", "r", "sea"]),
+            [
+                [-1, 0, 5, -1, -1, 2, -1],
+                [-1, -1, -1, -1, 1, -1, -1],
+                [2, -1, -1, -1, -1, 3, 0],
+                [-1, -1, -1, -1, -1, -1, -1],
+            ],
+        ),
+    ],
+)
+def test_str_find_multiple(patterns, expected):
+    s = cudf.Series(["strings", "to", "search", "in"])
+    t = patterns()
+
+    expected = cudf.Series(expected)
+
+    # We convert to pandas because find_multiple returns ListDtype(int32)
+    # and expected is ListDtype(int64).
+    # Currently there is no easy way to type-cast these to match.
+    assert_eq(s.str.find_multiple(t).to_pandas(), expected.to_pandas())
+
+    s = cudf.Index(s)
+    t = cudf.Index(t)
+
+    expected.index = s
+
+    assert_eq(s.str.find_multiple(t).to_pandas(), expected.to_pandas())
+
+
+def test_str_find_multiple_error():
+    s = cudf.Series(["strings", "to", "search", "in"])
+    with pytest.raises(
+        TypeError,
+        match=re.escape(
+            "patterns should be an array-like or a Series object, found "
+            "<class 'str'>"  # repr of type("a"), as formatted by the method
+        ),
+    ):
+        s.str.find_multiple("a")
+
+    t = cudf.Series([1, 2, 3])
+    with pytest.raises(
+        TypeError,
+        match=re.escape("patterns can only be of 'string' dtype, got: int64"),
+    ):
+        s.str.find_multiple(t)