Skip to content

Commit

Permalink
Add python/cython bindings for str.join API (#8085)
Browse files Browse the repository at this point in the history
Resolves #8079 

This PR:

- [x] Introduces bindings for `concatenate_list_elements` in cython and plumbs it to our python API, `.str.join`
- [x] Enables `str.join` and adds more test coverage for it.
- [x] Docstring addition and misc docs cleanup.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Keith Kraus (https://github.com/kkraus14)

URL: #8085
  • Loading branch information
galipremsagar authored Apr 29, 2021
1 parent 7f0ad1d commit ac25e97
Show file tree
Hide file tree
Showing 4 changed files with 401 additions and 19 deletions.
13 changes: 12 additions & 1 deletion python/cudf/cudf/_lib/cpp/strings/combine.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.table.table_view cimport table_view
Expand All @@ -17,3 +17,14 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil:
column_view source_strings,
string_scalar separator,
string_scalar narep) except +

cdef unique_ptr[column] concatenate_list_elements(
column_view lists_strings_column,
column_view separators,
string_scalar separator_narep,
string_scalar string_narep) except +

cdef unique_ptr[column] concatenate_list_elements(
column_view lists_strings_column,
string_scalar separator,
string_scalar narep) except +
76 changes: 74 additions & 2 deletions python/cudf/cudf/_lib/strings/combine.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
Expand All @@ -15,7 +15,8 @@ from cudf._lib.table cimport Table

from cudf._lib.cpp.strings.combine cimport (
concatenate as cpp_concatenate,
join_strings as cpp_join_strings
join_strings as cpp_join_strings,
concatenate_list_elements as cpp_concatenate_list_elements
)


Expand Down Expand Up @@ -78,3 +79,74 @@ def join(Column source_strings,
))

return Column.from_unique_ptr(move(c_result))


def join_lists_with_scalar(
        Column source_strings,
        object py_separator,
        object py_narep):
    """
    Join every list of strings in `source_strings` into one string
    per row, placing `py_separator` between consecutive strings and
    substituting `py_narep` for `<NA>`/`None` values.

    Parameters
    ----------
    source_strings : Column
        Column whose view is handed to `concatenate_list_elements`.
    py_separator : object
        Object exposing a `device_value` DeviceScalar that holds the
        separator string.
    py_narep : object
        Object exposing a `device_value` DeviceScalar that holds the
        replacement for `<NA>`/`None` values.

    Returns
    -------
    Column
        Column built from the result of `concatenate_list_elements`.
    """

    cdef DeviceScalar sep_scalar = py_separator.device_value
    cdef DeviceScalar narep_scalar = py_narep.device_value

    cdef column_view lists_view = source_strings.view()

    # The C++ API takes `string_scalar` values, so reinterpret the raw
    # scalar pointers held by the DeviceScalar wrappers.
    cdef const string_scalar* sep_ptr = <const string_scalar*>(
        sep_scalar.get_raw_ptr()
    )
    cdef const string_scalar* narep_ptr = <const string_scalar*>(
        narep_scalar.get_raw_ptr()
    )

    cdef unique_ptr[column] joined

    # Release the GIL for the duration of the C++ call.
    with nogil:
        joined = move(
            cpp_concatenate_list_elements(
                lists_view,
                sep_ptr[0],
                narep_ptr[0]
            )
        )

    return Column.from_unique_ptr(move(joined))


def join_lists_with_column(
        Column source_strings,
        Column separator_strings,
        object py_source_narep,
        object py_separator_narep):
    """
    Join every list of strings in `source_strings` into one string
    per row, using the separator found at the same row position in
    `separator_strings`. `<NA>`/`None` values in `source_strings` are
    replaced by `py_source_narep`, and `<NA>`/`None` values in
    `separator_strings` are replaced by `py_separator_narep`.

    Parameters
    ----------
    source_strings : Column
        Column whose view is handed to `concatenate_list_elements`
        as the lists-of-strings input.
    separator_strings : Column
        Column supplying one separator per row.
    py_source_narep : object
        Object exposing a `device_value` DeviceScalar holding the
        replacement for null strings.
    py_separator_narep : object
        Object exposing a `device_value` DeviceScalar holding the
        replacement for null separators.

    Returns
    -------
    Column
        Column built from the result of `concatenate_list_elements`.
    """

    cdef DeviceScalar strings_narep = py_source_narep.device_value
    cdef DeviceScalar seps_narep = py_separator_narep.device_value

    cdef column_view lists_view = source_strings.view()
    cdef column_view seps_view = separator_strings.view()

    # The C++ API takes `string_scalar` values, so reinterpret the raw
    # scalar pointers held by the DeviceScalar wrappers.
    cdef const string_scalar* strings_narep_ptr = <const string_scalar*>(
        strings_narep.get_raw_ptr()
    )
    cdef const string_scalar* seps_narep_ptr = <const string_scalar*>(
        seps_narep.get_raw_ptr()
    )

    cdef unique_ptr[column] joined

    # Release the GIL for the duration of the C++ call.
    # Note the argument order: the C++ overload expects the separator
    # replacement *before* the string replacement, the reverse of this
    # function's own parameter order.
    with nogil:
        joined = move(
            cpp_concatenate_list_elements(
                lists_view,
                seps_view,
                seps_narep_ptr[0],
                strings_narep_ptr[0]
            )
        )

    return Column.from_unique_ptr(move(joined))
209 changes: 195 additions & 14 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@
from cudf._lib.strings.combine import (
concatenate as cpp_concatenate,
join as cpp_join,
join_lists_with_column as cpp_join_lists_with_column,
join_lists_with_scalar as cpp_join_lists_with_scalar,
)
from cudf._lib.strings.contains import (
contains_re as cpp_contains_re,
Expand Down Expand Up @@ -465,17 +467,196 @@ def cat(self, others=None, sep=None, na_rep=None):
out = out[0]
return out

def join(
    self, sep=None, string_na_rep=None, sep_na_rep=None
) -> ParentType:
    """
    Join lists contained as elements in the Series/Index with passed
    delimiter.

    If the elements of a Series are lists themselves, join the content of
    these lists using the delimiter passed to the function.
    This function is an equivalent to :meth:`str.join`.

    Parameters
    ----------
    sep : str or array-like
        If str, the delimiter is used between list entries.
        If array-like, the string at a position is used as a
        delimiter for corresponding row of the list entries.
        If ``sep`` is ``None``, it defaults to an empty string ``""``.
    string_na_rep : str, default None
        This string will take the place of any null strings
        (not empty strings) in the Series.
        If ``string_na_rep`` is ``None``, it defaults to an empty
        string ``""``.
    sep_na_rep : str, default None
        This string will take the place of any null strings
        (not empty strings) in `sep`. This parameter can be used
        only if `sep` is array-like. If ``sep_na_rep`` is ``None``,
        it defaults to an empty string ``""``.

    Returns
    -------
    Series/Index: object
        The list entries concatenated by intervening occurrences of
        the delimiter.

    Raises
    ------
    ValueError
        - If ``sep_na_rep`` is supplied when ``sep`` is str.
        - If ``sep`` is array-like and not of equal length with Series/Index.
    TypeError
        - If ``string_na_rep`` or ``sep_na_rep`` are not scalar values.
        - If ``sep`` is not of following types: str or array-like.

    Examples
    --------
    >>> import cudf
    >>> ser = cudf.Series([['a', 'b', 'c'], ['d', 'e'], ['f'], ['g', ' ', 'h']])
    >>> ser
    0    [a, b, c]
    1       [d, e]
    2          [f]
    3    [g,  , h]
    dtype: list
    >>> ser.str.join(sep='-')
    0    a-b-c
    1      d-e
    2        f
    3    g- -h
    dtype: object

    ``sep`` can be an array-like input:

    >>> ser.str.join(sep=['-', '+', '.', '='])
    0    a-b-c
    1      d+e
    2        f
    3    g= =h
    dtype: object

    If the actual series doesn't have lists, each character is joined
    by `sep`:

    >>> ser = cudf.Series(['abc', 'def', 'ghi'])
    >>> ser
    0    abc
    1    def
    2    ghi
    dtype: object
    >>> ser.str.join(sep='_')
    0    a_b_c
    1    d_e_f
    2    g_h_i
    dtype: object

    We can replace `<NA>`/`None` values present in lists using
    ``string_na_rep``:

    >>> ser = cudf.Series([['a', 'b', None], None, ['c', 'd']])
    >>> ser
    0    [a, b, None]
    1            None
    2          [c, d]
    dtype: list
    >>> ser.str.join(sep='_', string_na_rep='k')
    0    a_b_k
    1     <NA>
    2      c_d
    dtype: object

    We can replace `<NA>`/`None` values present in lists of ``sep``
    using ``sep_na_rep``:

    >>> ser.str.join(sep=[None, '.', '-'], sep_na_rep='+')
    0    a+b+
    1    <NA>
    2     c-d
    dtype: object
    """  # noqa E501
    if sep is None:
        sep = ""

    if string_na_rep is None:
        string_na_rep = ""

    # Compare against None rather than relying on truthiness so that an
    # explicitly supplied but falsy value (e.g. "") is rejected as well,
    # matching the documented ValueError contract.
    if is_scalar(sep) and sep_na_rep is not None:
        raise ValueError(
            "sep_na_rep cannot be defined when `sep` is scalar."
        )

    if sep_na_rep is None:
        sep_na_rep = ""

    if not is_scalar(string_na_rep):
        raise TypeError(
            f"string_na_rep should be a string scalar, got {string_na_rep}"
            f" of type : {type(string_na_rep)}"
        )

    if isinstance(self._column, cudf.core.column.ListColumn):
        strings_column = self._column
    else:
        # If self._column is not a ListColumn, we will have to
        # split each row by character and create a ListColumn out of it.

        # TODO: Remove this workaround after the following
        # feature request is resolved
        # FEA: https://github.com/rapidsai/cudf/issues/8094
        strings_column = self._split_by_character()

    if is_scalar(sep):
        # One separator shared by every row.
        data = cpp_join_lists_with_scalar(
            strings_column, cudf.Scalar(sep), cudf.Scalar(string_na_rep)
        )
    elif can_convert_to_column(sep):
        # One separator per row; it must line up with the source rows.
        sep_column = column.as_column(sep)
        if len(sep_column) != len(strings_column):
            raise ValueError(
                f"sep should be of similar size to the series, "
                f"got: {len(sep_column)}, expected: {len(strings_column)}"
            )
        if not is_scalar(sep_na_rep):
            raise TypeError(
                f"sep_na_rep should be a string scalar, got {sep_na_rep} "
                f"of type: {type(sep_na_rep)}"
            )

        data = cpp_join_lists_with_column(
            strings_column,
            sep_column,
            cudf.Scalar(string_na_rep),
            cudf.Scalar(sep_na_rep),
        )
    else:
        raise TypeError(
            f"sep should be an str, array-like or Series object, "
            f"found {type(sep)}"
        )

    return self._return_or_inplace(data)

def _split_by_character(self):
    """
    Build a ``ListColumn`` from ``self._column`` in which each row holds
    the pieces produced by ``cpp_character_tokenize`` for that row.

    The list offsets are the running sum of the per-row values returned
    by ``cpp_count_bytes``, with a leading 0. The mask and null count of
    the source column are carried over unchanged.
    """
    source = self._column

    tokens = cpp_character_tokenize(source)
    row_sizes = cpp_count_bytes(source)

    # NOTE(review): offsets are derived from per-row *byte* counts while
    # the child column comes from character tokenization; these presumably
    # only line up when every character is a single byte -- confirm the
    # behaviour for multi-byte UTF-8 input.
    offsets = cudf.core.column.column_empty(
        row_count=len(row_sizes) + 1, dtype="int32"
    )
    offsets[0] = 0
    offsets[1:] = row_sizes
    # Running sum turns per-row sizes into list offsets.
    offsets = offsets._apply_scan_op("sum")

    return cudf.core.column.ListColumn(
        size=len(source),
        dtype=cudf.ListDtype(source.dtype),
        mask=source.mask,
        offset=0,
        null_count=source.null_count,
        children=(offsets, tokens),
    )

def extract(
self, pat: str, flags: int = 0, expand: bool = True
Expand Down Expand Up @@ -511,7 +692,7 @@ def extract(
--------
>>> import cudf
>>> s = cudf.Series(['a1', 'b2', 'c3'])
>>> s.str.extract(r'([ab])(\d)') # noqa W605
>>> s.str.extract(r'([ab])(\d)')
0 1
0 a 1
1 b 2
Expand All @@ -520,20 +701,20 @@ def extract(
A pattern with one group will return a DataFrame with one
column if expand=True.
>>> s.str.extract(r'[ab](\d)', expand=True) # noqa W605
>>> s.str.extract(r'[ab](\d)', expand=True)
0
0 1
1 2
2 <NA>
A pattern with one group will return a Series if expand=False.
>>> s.str.extract(r'[ab](\d)', expand=False) # noqa W605
>>> s.str.extract(r'[ab](\d)', expand=False)
0 1
1 2
2 <NA>
dtype: object
"""
""" # noqa W605
if flags != 0:
raise NotImplementedError("`flags` parameter is not yet supported")

Expand Down Expand Up @@ -621,7 +802,7 @@ def contains(
Returning any digit using regular expression.
>>> s1.str.contains('\d', regex=True) # noqa W605
>>> s1.str.contains('\d', regex=True)
0 False
1 False
2 False
Expand Down Expand Up @@ -654,7 +835,7 @@ def contains(
3 True
4 <NA>
dtype: bool
"""
""" # noqa W605
if case is not True:
raise NotImplementedError("`case` parameter is not yet supported")
elif flags != 0:
Expand Down Expand Up @@ -3075,7 +3256,7 @@ def count(self, pat: str, flags: int = 0) -> ParentType:
Escape ``'$'`` to find the literal dollar sign.
>>> s = cudf.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
>>> s.str.count('\$') # noqa W605
>>> s.str.count('\$')
0 1
1 0
2 1
Expand All @@ -3089,7 +3270,7 @@ def count(self, pat: str, flags: int = 0) -> ParentType:
>>> index = cudf.core.index.StringIndex(['A', 'A', 'Aaba', 'cat'])
>>> index.str.count('a')
Int64Index([0, 0, 2, 1], dtype='int64')
"""
""" # noqa W605
if flags != 0:
raise NotImplementedError("`flags` parameter is not yet supported")

Expand Down
Loading

0 comments on commit ac25e97

Please sign in to comment.