Add python/cython bindings for str.join API (#8085)

Resolves #8079. This PR: - [x] Introduces Cython bindings for `concatenate_list_elements` and plumbs them into our Python API as `.str.join`. - [x] Enables and adds more test coverage for `str.join`. - [x] Adds docstrings and miscellaneous docs cleanup. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Keith Kraus (https://github.com/kkraus14) URL: #8085
rapidsai · Apr 29, 2021 · ac25e97 · ac25e97
1 parent 7f0ad1d
commit ac25e97
Show file tree

Hide file tree

Showing 4 changed files with 401 additions and 19 deletions.
diff --git a/python/cudf/cudf/_lib/cpp/strings/combine.pxd b/python/cudf/cudf/_lib/cpp/strings/combine.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.table.table_view cimport table_view
@@ -17,3 +17,14 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil:
         column_view source_strings,
         string_scalar separator,
         string_scalar narep) except +
+
+    cdef unique_ptr[column] concatenate_list_elements(
+        column_view lists_strings_column,
+        column_view separators,
+        string_scalar separator_narep,
+        string_scalar string_narep) except +
+
+    cdef unique_ptr[column] concatenate_list_elements(
+        column_view lists_strings_column,
+        string_scalar separator,
+        string_scalar narep) except +
diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -15,7 +15,8 @@ from cudf._lib.table cimport Table
 
 from cudf._lib.cpp.strings.combine cimport (
     concatenate as cpp_concatenate,
-    join_strings as cpp_join_strings
+    join_strings as cpp_join_strings,
+    concatenate_list_elements as cpp_concatenate_list_elements
 )
 
 
@@ -78,3 +79,74 @@ def join(Column source_strings,
         ))
 
     return Column.from_unique_ptr(move(c_result))
+
+
+def join_lists_with_scalar(
+        Column source_strings,
+        object py_separator,
+        object py_narep):
+    """
+    Returns a Column by concatenating Lists of strings row-wise
+    in `source_strings` with the specified `py_separator`
+    between each string in lists and `<NA>`/`None` values
+    are replaced by `py_narep`
+    """
+
+    cdef DeviceScalar separator = py_separator.device_value
+    cdef DeviceScalar narep = py_narep.device_value
+
+    cdef unique_ptr[column] c_result
+    cdef column_view source_view = source_strings.view()
+
+    cdef const string_scalar* scalar_separator = \
+        <const string_scalar*>(separator.get_raw_ptr())
+    cdef const string_scalar* scalar_narep = <const string_scalar*>(
+        narep.get_raw_ptr()
+    )
+
+    with nogil:
+        c_result = move(cpp_concatenate_list_elements(
+            source_view,
+            scalar_separator[0],
+            scalar_narep[0]
+        ))
+
+    return Column.from_unique_ptr(move(c_result))
+
+
+def join_lists_with_column(
+        Column source_strings,
+        Column separator_strings,
+        object py_source_narep,
+        object py_separator_narep):
+    """
+    Returns a Column by concatenating Lists of strings row-wise in
+    `source_strings` with a corresponding separator at the same
+    position in `separator_strings` and `<NA>`/`None` values in
+    `source_strings` are replaced by `py_source_narep` and
+    `<NA>`/`None` values in `separator_strings` are replaced
+    by `py_separator_narep`
+    """
+
+    cdef DeviceScalar source_narep = py_source_narep.device_value
+    cdef DeviceScalar separator_narep = py_separator_narep.device_value
+
+    cdef unique_ptr[column] c_result
+    cdef column_view source_view = source_strings.view()
+    cdef column_view separator_view = separator_strings.view()
+
+    cdef const string_scalar* scalar_source_narep = \
+        <const string_scalar*>(source_narep.get_raw_ptr())
+    cdef const string_scalar* scalar_separator_narep = <const string_scalar*>(
+        separator_narep.get_raw_ptr()
+    )
+
+    with nogil:
+        c_result = move(cpp_concatenate_list_elements(
+            source_view,
+            separator_view,
+            scalar_separator_narep[0],
+            scalar_source_narep[0]
+        ))
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -79,6 +79,8 @@
 from cudf._lib.strings.combine import (
     concatenate as cpp_concatenate,
     join as cpp_join,
+    join_lists_with_column as cpp_join_lists_with_column,
+    join_lists_with_scalar as cpp_join_lists_with_scalar,
 )
 from cudf._lib.strings.contains import (
     contains_re as cpp_contains_re,
@@ -465,17 +467,196 @@ def cat(self, others=None, sep=None, na_rep=None):
                 out = out[0]
         return out
 
-    def join(self, sep) -> ParentType:
+    def join(
+        self, sep=None, string_na_rep=None, sep_na_rep=None
+    ) -> ParentType:
         """
         Join lists contained as elements in the Series/Index with passed
         delimiter.
 
-        Raises : NotImplementedError
-            Columns of arrays / lists are not yet supported.
-        """
-        raise NotImplementedError(
-            "Columns of arrays / lists are not yet " "supported"
+        If the elements of a Series are lists themselves, join the content of
+        these lists using the delimiter passed to the function.
+        This function is equivalent to :meth:`str.join`.
+
+        Parameters
+        ----------
+        sep : str or array-like
+            If str, the delimiter is used between list entries.
+            If array-like, the string at a position is used as a
+            delimiter for corresponding row of the list entries.
+        string_na_rep : str, default None
+            This string will take the place of any null strings
+            (not empty strings) in the Series.
+            If ``string_na_rep`` is ``None``, it defaults to the
+            empty string "".
+        sep_na_rep : str, default None
+            This string will take the place of any null strings
+            (not empty strings) in `sep`. This parameter can be used
+            only if `sep` is array-like. If ``sep_na_rep`` is ``None``,
+            it defaults to the empty string "".
+
+        Returns
+        -------
+        Series/Index: object
+            The list entries concatenated by intervening occurrences of
+            the delimiter.
+
+        Raises
+        ------
+        ValueError
+            - If ``sep_na_rep`` is supplied when ``sep`` is str.
+            - If ``sep`` is array-like and not of equal length with Series/Index.
+        TypeError
+            - If ``string_na_rep`` or ``sep_na_rep`` are not scalar values.
+            - If ``sep`` is not of following types: str or array-like.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> ser = cudf.Series([['a', 'b', 'c'], ['d', 'e'], ['f'], ['g', ' ', 'h']])
+        >>> ser
+        0    [a, b, c]
+        1       [d, e]
+        2          [f]
+        3    [g,  , h]
+        dtype: list
+        >>> ser.str.join(sep='-')
+        0    a-b-c
+        1      d-e
+        2        f
+        3    g- -h
+        dtype: object
+
+        ``sep`` can be an array-like input:
+
+        >>> ser.str.join(sep=['-', '+', '.', '='])
+        0    a-b-c
+        1      d+e
+        2        f
+        3    g= =h
+        dtype: object
+
+        If the actual series doesn't have lists, each character is joined
+        by `sep`:
+
+        >>> ser = cudf.Series(['abc', 'def', 'ghi'])
+        >>> ser
+        0    abc
+        1    def
+        2    ghi
+        dtype: object
+        >>> ser.str.join(sep='_')
+        0    a_b_c
+        1    d_e_f
+        2    g_h_i
+        dtype: object
+
+        We can replace `<NA>`/`None` values present in lists using
+        ``string_na_rep``:
+
+        >>> ser = cudf.Series([['a', 'b', None], None, ['c', 'd']])
+        >>> ser
+        0    [a, b, None]
+        1            None
+        2          [c, d]
+        dtype: list
+        >>> ser.str.join(sep='_', string_na_rep='k')
+        0    a_b_k
+        1     <NA>
+        2      c_d
+        dtype: object
+
+        We can replace `<NA>`/`None` values present in an array-like
+        ``sep`` using ``sep_na_rep``:
+
+        >>> ser.str.join(sep=[None, '.', '-'], sep_na_rep='+')
+        0    a+b+
+        1    <NA>
+        2     c-d
+        dtype: object
+        """  # noqa E501
+        if sep is None:
+            sep = ""
+
+        if string_na_rep is None:
+            string_na_rep = ""
+
+        if is_scalar(sep) and sep_na_rep:
+            raise ValueError(
+                "sep_na_rep cannot be defined when `sep` is scalar."
+            )
+
+        if sep_na_rep is None:
+            sep_na_rep = ""
+
+        if not is_scalar(string_na_rep):
+            raise TypeError(
+                f"string_na_rep should be a string scalar, got {string_na_rep}"
+                f" of type : {type(string_na_rep)}"
+            )
+
+        if isinstance(self._column, cudf.core.column.ListColumn):
+            strings_column = self._column
+        else:
+            # If self._column is not a ListColumn, we will have to
+            # split each row by character and create a ListColumn out of it.
+
+            # TODO: Remove this workaround after the following
+            # feature request is resolved
+            # FEA: https://github.com/rapidsai/cudf/issues/8094
+            strings_column = self._split_by_character()
+
+        if is_scalar(sep):
+            data = cpp_join_lists_with_scalar(
+                strings_column, cudf.Scalar(sep), cudf.Scalar(string_na_rep)
+            )
+        elif can_convert_to_column(sep):
+            sep_column = column.as_column(sep)
+            if len(sep_column) != len(strings_column):
+                raise ValueError(
+                    f"sep should be of similar size to the series, "
+                    f"got: {len(sep_column)}, expected: {len(strings_column)}"
+                )
+            if not is_scalar(sep_na_rep):
+                raise TypeError(
+                    f"sep_na_rep should be a string scalar, got {sep_na_rep} "
+                    f"of type: {type(sep_na_rep)}"
+                )
+
+            data = cpp_join_lists_with_column(
+                strings_column,
+                sep_column,
+                cudf.Scalar(string_na_rep),
+                cudf.Scalar(sep_na_rep),
+            )
+        else:
+            raise TypeError(
+                f"sep should be an str, array-like or Series object, "
+                f"found {type(sep)}"
+            )
+
+        return self._return_or_inplace(data)
+
+    def _split_by_character(self):
+        result_col = cpp_character_tokenize(self._column)
+
+        bytes_count = cpp_count_bytes(self._column)
+        offset_col = cudf.core.column.column_empty(
+            row_count=len(bytes_count) + 1, dtype="int32"
         )
+        offset_col[0] = 0
+        offset_col[1:] = bytes_count
+        offset_col = offset_col._apply_scan_op("sum")
+
+        res = cudf.core.column.ListColumn(
+            size=len(self._column),
+            dtype=cudf.ListDtype(self._column.dtype),
+            mask=self._column.mask,
+            offset=0,
+            null_count=self._column.null_count,
+            children=(offset_col, result_col),
+        )
+        return res
 
     def extract(
         self, pat: str, flags: int = 0, expand: bool = True
@@ -511,7 +692,7 @@ def extract(
         --------
         >>> import cudf
         >>> s = cudf.Series(['a1', 'b2', 'c3'])
-        >>> s.str.extract(r'([ab])(\d)')                                # noqa W605
+        >>> s.str.extract(r'([ab])(\d)')
               0     1
         0     a     1
         1     b     2
@@ -520,20 +701,20 @@ def extract(
         A pattern with one group will return a DataFrame with one
         column if expand=True.
 
-        >>> s.str.extract(r'[ab](\d)', expand=True)                     # noqa W605
+        >>> s.str.extract(r'[ab](\d)', expand=True)
               0
         0     1
         1     2
         2  <NA>
 
         A pattern with one group will return a Series if expand=False.
 
-        >>> s.str.extract(r'[ab](\d)', expand=False)                    # noqa W605
+        >>> s.str.extract(r'[ab](\d)', expand=False)
         0       1
         1       2
         2    <NA>
         dtype: object
-        """
+        """  # noqa W605
         if flags != 0:
             raise NotImplementedError("`flags` parameter is not yet supported")
 
@@ -621,7 +802,7 @@ def contains(
 
         Returning any digit using regular expression.
 
-        >>> s1.str.contains('\d', regex=True)                               # noqa W605
+        >>> s1.str.contains('\d', regex=True)
         0    False
         1    False
         2    False
@@ -654,7 +835,7 @@ def contains(
         3     True
         4     <NA>
         dtype: bool
-        """
+        """  # noqa W605
         if case is not True:
             raise NotImplementedError("`case` parameter is not yet supported")
         elif flags != 0:
@@ -3075,7 +3256,7 @@ def count(self, pat: str, flags: int = 0) -> ParentType:
         Escape ``'$'`` to find the literal dollar sign.
 
         >>> s = cudf.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
-        >>> s.str.count('\$')                                       # noqa W605
+        >>> s.str.count('\$')
         0    1
         1    0
         2    1
@@ -3089,7 +3270,7 @@ def count(self, pat: str, flags: int = 0) -> ParentType:
         >>> index = cudf.core.index.StringIndex(['A', 'A', 'Aaba', 'cat'])
         >>> index.str.count('a')
         Int64Index([0, 0, 2, 1], dtype='int64')
-        """
+        """  # noqa W605
         if flags != 0:
             raise NotImplementedError("`flags` parameter is not yet supported")