Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Series string repeat #8882

Merged
merged 9 commits into from
Aug 5, 2021
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions python/cudf/cudf/_lib/cpp/strings/repeat.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.types cimport size_type


# Cython declarations for the libcudf string-repeat functions found in
# "cudf/strings/repeat_strings.hpp" (C++ namespace ``cudf::strings``).
# The block is marked ``nogil``, so the declared functions can be called
# with the GIL released.
cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \
nogil:

# Overload taking a single repeat count (``size_type``) for the column.
# ``except +`` makes Cython convert C++ exceptions into Python exceptions.
cdef unique_ptr[column] repeat_strings(
column_view strings,
size_type repeat) except +

# Overload taking a column of repeat counts
# (presumably one count per input string -- confirm against the header).
cdef unique_ptr[column] repeat_strings(
column_view strings,
column_view repeats) except +
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
from cudf._lib.strings.findall import findall
from cudf._lib.strings.json import get_json_object
from cudf._lib.strings.padding import PadSide, center, ljust, pad, rjust, zfill
from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence
from cudf._lib.strings.replace import (
insert,
replace,
Expand Down
49 changes: 49 additions & 0 deletions python/cudf/cudf/_lib/strings/repeat.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.strings cimport repeat as cpp_repeat
from cudf._lib.cpp.types cimport size_type


def repeat_scalar(Column source_strings,
                  size_type repeats):
    """
    Repeat every string of `source_strings` the same number of times.

    Returns a new Column in which each string of `source_strings`
    has been repeated `repeats` times.
    """
    cdef column_view strings_view = source_strings.view()
    cdef unique_ptr[column] c_repeated

    # Release the GIL while the libcudf call runs.
    with nogil:
        c_repeated = move(
            cpp_repeat.repeat_strings(strings_view, repeats)
        )

    return Column.from_unique_ptr(move(c_repeated))


def repeat_sequence(Column source_strings,
                    Column repeats):
    """
    Repeat the strings of `source_strings` using a column of counts.

    Returns a new Column in which each string of `source_strings`
    has been repeated the number of times given by the `repeats`
    column.
    """
    cdef column_view strings_view = source_strings.view()
    cdef column_view counts_view = repeats.view()
    cdef unique_ptr[column] c_repeated

    # Release the GIL while the libcudf call runs.
    with nogil:
        c_repeated = move(
            cpp_repeat.repeat_strings(strings_view, counts_view)
        )

    return Column.from_unique_ptr(move(c_repeated))
59 changes: 59 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,65 @@ def contains(
)
return self._return_or_inplace(result_col)

def repeat(self, repeats: Union[int, Sequence],) -> SeriesOrIndex:
    """
    Duplicate each string in the Series or Index.
    Equivalent to `str.repeat()
    <https://pandas.pydata.org/docs/reference/api/pandas.Series.str.repeat.html>`_.

    Parameters
    ----------
    repeats : int or sequence of int
        A single int repeats every string the same number of times;
        a sequence supplies a separate repeat count for each string.
        ``None`` is expanded to a sequence of nulls with one entry
        per row, and the result is then cast to ``float64``.

    Returns
    -------
    Series or Index of object
        Series or Index of repeated string objects specified by
        input parameter repeats.

    Examples
    --------
    >>> s = cudf.Series(['a', 'b', 'c'])
    >>> s
    0    a
    1    b
    2    c
    dtype: object

    Single int repeats string in Series

    >>> s.str.repeat(repeats=2)
    0    aa
    1    bb
    2    cc
    dtype: object

    Sequence of int repeats corresponding string in Series

    >>> s.str.repeat(repeats=[1, 2, 3])
    0      a
    1     bb
    2    ccc
    dtype: object
    """
    # A bare ``None`` becomes one null repeat count per row.
    repeats_was_none = repeats is None
    if repeats_was_none:
        repeats = [None] * len(self._column)

    # Anything that cannot become a column is handled as a scalar count.
    if not can_convert_to_column(repeats):
        repeated_col = libstrings.repeat_scalar(self._column, repeats)
        return self._return_or_inplace(repeated_col)

    counts = column.as_column(repeats, dtype="int")
    repeated = self._return_or_inplace(
        libstrings.repeat_sequence(self._column, counts)
    )
    if repeats_was_none:
        return repeated.astype("float64")
    return repeated

def replace(
self,
pat: Union[str, Sequence],
Expand Down
27 changes: 27 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -852,6 +852,33 @@ def test_string_contains(ps_gs, pat, regex, flags, flags_raise, na, na_raise):
assert_eq(expect, got)


@pytest.mark.parametrize(
    "data", [["hello", "world", None, "", "!"]],
)
@pytest.mark.parametrize(
    "repeats",
    [
        2,
        0,
        -3,
        None,
        [5, 4, 3, 2, 6],
        [5, None, 3, 2, 6],
        [0, 0, 0, 0, 0],
        [-1, -2, -3, -4, -5],
        [None, None, None, None, None],
    ],
)
def test_string_repeat(data, repeats):
    """Compare ``str.repeat`` on a cudf Series with the pandas result."""
    pandas_series = pd.Series(data)
    cudf_series = cudf.from_pandas(pandas_series)

    assert_eq(
        pandas_series.str.repeat(repeats),
        cudf_series.str.repeat(repeats),
    )


# Pandas doesn't respect the `n` parameter, so it is omitted from the test parameters
@pytest.mark.parametrize(
"pat,regex",
Expand Down