Skip to content

Commit

Permalink
Add .str.find_multiple API (#11928)
Browse files Browse the repository at this point in the history
Resolves: #10126

This PR adds the `.str.find_multiple` API.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Bradley Dice (https://github.com/bdice)

URL: #11928
  • Loading branch information
galipremsagar authored Oct 18, 2022
1 parent afa16b4 commit a926c52
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/cudf/source/api_docs/string_handling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ strings and apply several methods to it. These can be accessed like
filter_tokens
find
findall
find_multiple
get
get_json_object
hex_to_int
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
startswith,
startswith_multiple,
)
from cudf._lib.strings.find_multiple import find_multiple
from cudf._lib.strings.findall import findall
from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object
from cudf._lib.strings.padding import (
Expand Down
64 changes: 64 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3623,6 +3623,70 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex:
data = libstrings.findall(self._column, pat, flags)
return self._return_or_inplace(data)

def find_multiple(self, patterns: SeriesOrIndex) -> "cudf.Series":
    """
    Locate the first occurrence of every pattern in each string.

    Parameters
    ----------
    patterns : array-like, Sequence or Series
        Patterns to search for in the given Series/Index.

    Returns
    -------
    Series
        A Series holding, for each input string, a list with the
        position of each pattern's first occurrence. A pattern that
        does not occur in the string is reported as -1 at its slot.

    Raises
    ------
    TypeError
        If ``patterns`` cannot be converted to a column, or if the
        converted column is not of string dtype.

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series(["strings", "to", "search", "in"])
    >>> s
    0    strings
    1         to
    2     search
    3         in
    dtype: object
    >>> t = cudf.Series(["a", "string", "g", "inn", "o", "r", "sea"])
    >>> t
    0         a
    1    string
    2         g
    3       inn
    4         o
    5         r
    6       sea
    dtype: object
    >>> s.str.find_multiple(t)
    0       [-1, 0, 5, -1, -1, 2, -1]
    1     [-1, -1, -1, -1, 1, -1, -1]
    2       [2, -1, -1, -1, -1, 3, 0]
    3    [-1, -1, -1, -1, -1, -1, -1]
    dtype: list
    """
    # Reject inputs that cannot be turned into a column up front.
    if not can_convert_to_column(patterns):
        raise TypeError(
            "patterns should be an array-like or a Series object, "
            f"found {type(patterns)}"
        )

    patterns_column = column.as_column(patterns)
    if not isinstance(patterns_column, StringColumn):
        raise TypeError(
            "patterns can only be of 'string' dtype, "
            f"got: {patterns_column.dtype}"
        )

    positions = libstrings.find_multiple(self._column, patterns_column)

    # The result is always a Series: a Series parent contributes its own
    # index, while an Index parent is itself used as the result's index.
    parent = self._parent
    if isinstance(parent, cudf.Series):
        result_index = parent.index
    else:
        result_index = parent

    return cudf.Series(positions, index=result_index, name=parent.name)

def isempty(self) -> SeriesOrIndex:
"""
Check whether each string is an empty string.
Expand Down
61 changes: 61 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3423,3 +3423,64 @@ def test_str_join_lists(sr, sep, string_na_rep, sep_na_rep, expected):
sep=sep, string_na_rep=string_na_rep, sep_na_rep=sep_na_rep
)
assert_eq(actual, expected)


# NOTE(review): the pattern inputs are wrapped in lambdas, presumably so the
# cudf objects are only built when the test body runs rather than at
# collection time -- confirm before simplifying.
@pytest.mark.parametrize(
    "patterns, expected",
    [
        (
            lambda: ["a", "s", "g", "i", "o", "r"],
            [
                [-1, 0, 5, 3, -1, 2],
                [-1, -1, -1, -1, 1, -1],
                [2, 0, -1, -1, -1, 3],
                [-1, -1, -1, 0, -1, -1],
            ],
        ),
        (
            lambda: cudf.Series(
                ["a", "string", "g", "inn", "o", "r", "sea"]
            ),
            [
                [-1, 0, 5, -1, -1, 2, -1],
                [-1, -1, -1, -1, 1, -1, -1],
                [2, -1, -1, -1, -1, 3, 0],
                [-1, -1, -1, -1, -1, -1, -1],
            ],
        ),
    ],
)
def test_str_find_multiple(patterns, expected):
    """Check ``.str.find_multiple`` on a Series and on the equivalent Index.

    ``patterns`` is a zero-argument callable producing the patterns to
    search for; ``expected`` is the nested list of first-occurrence
    positions (-1 for "not found"), one inner list per input string.
    """
    haystack = cudf.Series(["strings", "to", "search", "in"])
    needles = patterns()

    expected = cudf.Series(expected)

    # find_multiple yields ListDtype(int32) whereas ``expected`` is built as
    # ListDtype(int64). There is currently no easy cast between the two, so
    # both sides are compared after conversion to pandas.
    series_result = haystack.str.find_multiple(needles)
    assert_eq(series_result.to_pandas(), expected.to_pandas())

    # Repeat the check with Index inputs; the result is then indexed by the
    # searched Index itself, so the expectation is re-indexed to match.
    haystack_index = cudf.Index(haystack)
    needles_index = cudf.Index(needles)

    expected.index = haystack_index

    index_result = haystack_index.str.find_multiple(needles_index)
    assert_eq(index_result.to_pandas(), expected.to_pandas())


def test_str_find_multiple_error():
    """Check that ``.str.find_multiple`` rejects invalid ``patterns``.

    Two failure modes are covered: a bare string (not accepted as
    array-like) and a Series whose dtype is not string.
    """
    haystack = cudf.Series(["strings", "to", "search", "in"])

    # A plain ``str`` is rejected before any column conversion happens.
    not_array_like_msg = re.escape(
        "patterns should be an array-like or a Series object, found "
        "<class 'str'>"
    )
    with pytest.raises(TypeError, match=not_array_like_msg):
        haystack.str.find_multiple("a")

    # A convertible input is still rejected when it is not of string dtype.
    wrong_dtype_msg = re.escape(
        "patterns can only be of 'string' dtype, got: int64"
    )
    numeric_patterns = cudf.Series([1, 2, 3])
    with pytest.raises(TypeError, match=wrong_dtype_msg):
        haystack.str.find_multiple(numeric_patterns)

0 comments on commit a926c52

Please sign in to comment.