[REVIEW] Add python/cython bindings for str.join API #8085

Merged Apr 29, 2021 (5 commits)
Changes from 2 commits
13 changes: 12 additions & 1 deletion python/cudf/cudf/_lib/cpp/strings/combine.pxd
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.table.table_view cimport table_view
@@ -17,3 +17,14 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil:
column_view source_strings,
string_scalar separator,
string_scalar narep) except +

cdef unique_ptr[column] concatenate_list_elements(
column_view lists_strings_column,
column_view separators,
string_scalar separator_narep,
string_scalar string_narep) except +

cdef unique_ptr[column] concatenate_list_elements(
column_view lists_strings_column,
string_scalar separator,
string_scalar narep) except +
76 changes: 74 additions & 2 deletions python/cudf/cudf/_lib/strings/combine.pyx
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
@@ -15,7 +15,8 @@ from cudf._lib.table cimport Table

from cudf._lib.cpp.strings.combine cimport (
concatenate as cpp_concatenate,
join_strings as cpp_join_strings
join_strings as cpp_join_strings,
concatenate_list_elements as cpp_concatenate_list_elements
)


@@ -78,3 +79,74 @@ def join(Column source_strings,
))

return Column.from_unique_ptr(move(c_result))


def join_lists_with_scalar(
Column source_strings,
object py_separator,
object py_narep):
"""
Returns a Column by concatenating the lists of strings in
`source_strings` row-wise, inserting `py_separator` between
the strings in each list and replacing `<NA>`/`None` values
with `py_narep`.
"""

cdef DeviceScalar separator = py_separator.device_value
cdef DeviceScalar narep = py_narep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_separator = \
<const string_scalar*>(separator.get_raw_ptr())
cdef const string_scalar* scalar_narep = <const string_scalar*>(
narep.get_raw_ptr()
)

with nogil:
c_result = move(cpp_concatenate_list_elements(
source_view,
scalar_separator[0],
scalar_narep[0]
))

return Column.from_unique_ptr(move(c_result))


def join_lists_with_column(
Column source_strings,
Column separator_strings,
object py_source_narep,
object py_separator_narep):
"""
Returns a Column by concatenating the lists of strings in
`source_strings` row-wise, using the separator at the same
position in `separator_strings` between the strings in each
list. `<NA>`/`None` values in `source_strings` are replaced
by `py_source_narep`, and `<NA>`/`None` values in
`separator_strings` are replaced by `py_separator_narep`.
"""

cdef DeviceScalar source_narep = py_source_narep.device_value
cdef DeviceScalar separator_narep = py_separator_narep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef column_view separator_view = separator_strings.view()

cdef const string_scalar* scalar_source_narep = \
<const string_scalar*>(source_narep.get_raw_ptr())
cdef const string_scalar* scalar_separator_narep = <const string_scalar*>(
separator_narep.get_raw_ptr()
)

with nogil:
c_result = move(cpp_concatenate_list_elements(
source_view,
separator_view,
scalar_separator_narep[0],
scalar_source_narep[0]
))

return Column.from_unique_ptr(move(c_result))
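
For orientation, a minimal usage sketch of what these two wrappers enable at the Python level; the `str.join` changes in `string.py` below dispatch to them, and the expected output is inferred from the docstring examples added there rather than produced by running this snippet:

# Hedged usage sketch of the new list-aware str.join behavior.
import cudf

ser = cudf.Series([["a", "b"], ["c", None]])

# Scalar separator -> join_lists_with_scalar under the hood.
ser.str.join(sep="-", string_na_rep="?")
# expected:
# 0    a-b
# 1    c-?
# dtype: object

# Column of per-row separators -> join_lists_with_column under the hood.
ser.str.join(sep=["-", "+"], string_na_rep="?", sep_na_rep="*")
# expected:
# 0    a-b
# 1    c+?
# dtype: object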
202 changes: 188 additions & 14 deletions python/cudf/cudf/core/column/string.py
@@ -78,6 +78,8 @@
from cudf._lib.strings.combine import (
concatenate as cpp_concatenate,
join as cpp_join,
join_lists_with_column as cpp_join_lists_with_column,
join_lists_with_scalar as cpp_join_lists_with_scalar,
)
from cudf._lib.strings.contains import (
contains_re as cpp_contains_re,
@@ -464,17 +466,189 @@ def cat(self, others=None, sep=None, na_rep=None):
out = out[0]
return out

def join(self, sep) -> ParentType:
def join(
self, sep=None, string_na_rep=None, sep_na_rep=None
) -> ParentType:
"""
Join lists contained as elements in the Series/Index with passed
delimiter.

Raises : NotImplementedError
Columns of arrays / lists are not yet supported.
"""
raise NotImplementedError(
"Columns of arrays / lists are not yet " "supported"
If the elements of a Series are lists themselves, join the content of
these lists using the delimiter passed to the function.
This function is equivalent to :meth:`str.join`.

Parameters
----------
sep : str or array-like
If str, the delimiter is used between list entries.
If array-like, the string at a position is used as a
delimiter for corresponding row of the list entries.
string_na_rep : str, default None
This string will take the place of any null strings
(not empty strings) in the Series.
If ``string_na_rep`` is ``None``, it defaults to an empty
string, "".
sep_na_rep : str, default None
This string will take the place of any null strings
(not empty strings) in `sep`. This parameter can be used
only if `sep` is array-like. If ``sep_na_rep`` is ``None``,
it defaults to an empty string, "".

Returns
-------
Series/Index: object
The list entries concatenated by intervening occurrences of
the delimiter.

Raises
------
ValueError
- If ``sep_na_rep`` is supplied when ``sep`` is str.
- If ``sep`` is array-like and its length does not match that of the Series/Index.
TypeError
- If ``string_na_rep`` or ``sep_na_rep`` are not scalar values.
- If ``sep`` is not of following types: str or array-like.

Examples
--------
>>> import cudf
>>> ser = cudf.Series([['a', 'b', 'c'], ['d', 'e'], ['f'], ['g', ' ', 'h']])
>>> ser
0 [a, b, c]
1 [d, e]
2 [f]
3 [g, , h]
dtype: list
>>> ser.str.join(sep='-')
0 a-b-c
1 d-e
2 f
3 g- -h
dtype: object

``sep`` can also be an array-like input:

>>> ser.str.join(sep=['-', '+', '.', '='])
0 a-b-c
1 d+e
2 f
3 g= =h
dtype: object

If the Series does not contain lists, the characters of each
string are joined by `sep`:

>>> ser = cudf.Series(['abc', 'def', 'ghi'])
>>> ser
0 abc
1 def
2 ghi
dtype: object
>>> ser.str.join(sep='_')
0 a_b_c
1 d_e_f
2 g_h_i
dtype: object

We can replace `<NA>`/`None` values present in lists using
``string_na_rep``:

>>> ser = cudf.Series([['a', 'b', None], None, ['c', 'd']])
>>> ser
0 [a, b, None]
1 None
2 [c, d]
dtype: list
>>> ser.str.join(sep='_', string_na_rep='k')
0 a_b_k
1 <NA>
2 c_d
dtype: object

We can replace `<NA>`/`None` values present in ``sep`` using
``sep_na_rep``:

>>> ser.str.join(sep=[None, '.', '-'], sep_na_rep='+')
0 a+b+
1 <NA>
2 c-d
dtype: object
""" # noqa E501
if sep is None:
sep = ""

if string_na_rep is None:
string_na_rep = ""

if is_scalar(sep) and sep_na_rep:
raise ValueError(
"sep_na_rep cannot be defined when `sep` is scalar."
)

if sep_na_rep is None:
sep_na_rep = ""

if not is_scalar(string_na_rep):
raise TypeError(
f"string_na_rep should be a string scalar, got {string_na_rep}"
f" of type : {type(string_na_rep)}"
)

if isinstance(self._column, cudf.core.column.ListColumn):
strings_column = self._column
else:
# If self._column is not a ListColumn, we will have to
# split each row by character and create a ListColumn out of it.
strings_column = self._split_by_character()
Collaborator comment: This is really expensive both computation and memory wise. We may want to raise an issue for a future optimization to prevent us from having to materialize the offsets here.

Contributor (author) reply: Opened a FEA: #8094 and added a todo here.
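
For a rough sense of the overhead being flagged, here is a back-of-the-envelope sketch; the row count, average string length, and int32 offset width are assumptions, and the figure is a lower bound since the character-tokenized child column also carries its own offsets:

# Rough lower-bound estimate (assumptions noted above) of the extra device
# memory the _split_by_character fallback materializes for a strings column.
n_rows, avg_len = 10_000_000, 20  # assumed workload
char_bytes = n_rows * avg_len  # character data copied into the child column
offset_bytes = (n_rows + 1) * 4  # int32 offsets built for the ListColumn
print(f"~{(char_bytes + offset_bytes) / 1e6:.0f} MB extra")  # ~240 MB here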


if is_scalar(sep):
data = cpp_join_lists_with_scalar(
strings_column, cudf.Scalar(sep), cudf.Scalar(string_na_rep)
)
elif can_convert_to_column(sep):
sep_column = column.as_column(sep)
if len(sep_column) != len(strings_column):
raise ValueError(
f"sep should be of similar size to the series, "
f"got: {len(sep_column)}, expected: {len(strings_column)}"
)
if not is_scalar(sep_na_rep):
raise TypeError(
f"sep_na_rep should be a string scalar, got {sep_na_rep} "
f"of type: {type(sep_na_rep)}"
)

data = cpp_join_lists_with_column(
strings_column,
sep_column,
cudf.Scalar(string_na_rep),
cudf.Scalar(sep_na_rep),
)
else:
raise TypeError(
f"sep should be an str, array-like or Series object, "
f"found {type(sep)}"
)

return self._return_or_inplace(data)

def _split_by_character(self):
result_col = cpp_character_tokenize(self._column)

bytes_count = cpp_count_bytes(self._column)
offset_col = cudf.core.column.as_column([0], dtype="int32")
offset_col = offset_col.append(bytes_count)
offset_col = offset_col._apply_scan_op("sum")

res = cudf.core.column.ListColumn(
size=len(self._column),
dtype=cudf.ListDtype(self._column.dtype),
mask=self._column.mask,
offset=0,
null_count=self._column.null_count,
children=(offset_col, result_col),
)
return res
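
To make the offsets construction above concrete, here is a minimal plain-Python sketch of the same bookkeeping (assuming single-byte characters so byte counts equal character counts; the code above does this with cudf columns on the GPU):

# Plain-Python illustration (not cudf API) of how _split_by_character builds
# a list column: character-tokenize the strings, then turn per-row byte
# counts into offsets via a running-sum scan.
rows = ["abc", "de", "f"]
chars = [c for s in rows for c in s]  # flattened child column of characters
byte_counts = [len(s) for s in rows]  # per-row byte counts (cpp_count_bytes)
offsets = [0]
for n in byte_counts:  # prefix sum over the counts (the "sum" scan)
    offsets.append(offsets[-1] + n)
# offsets == [0, 3, 5, 6]; row i is chars[offsets[i]:offsets[i+1]]
assert ["".join(chars[o:p]) for o, p in zip(offsets, offsets[1:])] == rows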

def extract(
self, pat: str, flags: int = 0, expand: bool = True
@@ -510,7 +684,7 @@ def extract(
--------
>>> import cudf
>>> s = cudf.Series(['a1', 'b2', 'c3'])
>>> s.str.extract(r'([ab])(\d)') # noqa W605
>>> s.str.extract(r'([ab])(\d)')
0 1
0 a 1
1 b 2
@@ -519,20 +693,20 @@
A pattern with one group will return a DataFrame with one
column if expand=True.

>>> s.str.extract(r'[ab](\d)', expand=True) # noqa W605
>>> s.str.extract(r'[ab](\d)', expand=True)
0
0 1
1 2
2 <NA>

A pattern with one group will return a Series if expand=False.

>>> s.str.extract(r'[ab](\d)', expand=False) # noqa W605
>>> s.str.extract(r'[ab](\d)', expand=False)
0 1
1 2
2 <NA>
dtype: object
"""
""" # noqa W605
if flags != 0:
raise NotImplementedError("`flags` parameter is not yet supported")

@@ -620,7 +794,7 @@ def contains(

Returning any digit using regular expression.

>>> s1.str.contains('\d', regex=True) # noqa W605
>>> s1.str.contains('\d', regex=True)
0 False
1 False
2 False
@@ -653,7 +827,7 @@
3 True
4 <NA>
dtype: bool
"""
""" # noqa W605
if case is not True:
raise NotImplementedError("`case` parameter is not yet supported")
elif flags != 0:
@@ -3074,7 +3248,7 @@ def count(self, pat: str, flags: int = 0) -> ParentType:
Escape ``'$'`` to find the literal dollar sign.

>>> s = cudf.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
>>> s.str.count('\$') # noqa W605
>>> s.str.count('\$')
0 1
1 0
2 1
@@ -3088,7 +3262,7 @@ def count(self, pat: str, flags: int = 0) -> ParentType:
>>> index = cudf.core.index.StringIndex(['A', 'A', 'Aaba', 'cat'])
>>> index.str.count('a')
Int64Index([0, 0, 2, 1], dtype='int64')
"""
""" # noqa W605
if flags != 0:
raise NotImplementedError("`flags` parameter is not yet supported")
