From dde5c8b937f2b4a5dc45ef7167d011af2899fdf5 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 27 Apr 2021 14:01:44 -0700
Subject: [PATCH 1/4] enable str.join API

---
 python/cudf/cudf/_lib/cpp/strings/combine.pxd |  13 +-
 python/cudf/cudf/_lib/strings/combine.pyx     |  76 ++++++++++-
 python/cudf/cudf/core/column/string.py        |  82 +++++++++++-
 python/cudf/cudf/tests/test_string.py         | 122 +++++++++++++++++-
 4 files changed, 285 insertions(+), 8 deletions(-)
diff --git a/python/cudf/cudf/_lib/cpp/strings/combine.pxd b/python/cudf/cudf/_lib/cpp/strings/combine.pxd
index 2670c67908f..250c6441882 100644
--- a/python/cudf/cudf/_lib/cpp/strings/combine.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/combine.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.table.table_view cimport table_view
@@ -17,3 +17,14 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil:
         column_view source_strings,
         string_scalar separator,
         string_scalar narep) except +
+
+    cdef unique_ptr[column] concatenate_list_elements(
+        column_view lists_strings_column,
+        column_view separators,
+        string_scalar separator_narep,
+        string_scalar string_narep) except +
+
+    cdef unique_ptr[column] concatenate_list_elements(
+        column_view lists_strings_column,
+        string_scalar separator,
+        string_scalar narep) except +
diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx
index 04fde5be9e8..25619de3ed0 100644
--- a/python/cudf/cudf/_lib/strings/combine.pyx
+++ b/python/cudf/cudf/_lib/strings/combine.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -15,7 +15,8 @@ from cudf._lib.table cimport Table
 
 from cudf._lib.cpp.strings.combine cimport (
     concatenate as cpp_concatenate,
-    join_strings as cpp_join_strings
+    join_strings as cpp_join_strings,
+    concatenate_list_elements as cpp_concatenate_list_elements
 )
 
 
@@ -78,3 +79,74 @@ def join(Column source_strings,
         ))
 
     return Column.from_unique_ptr(move(c_result))
+
+
+def join_lists_with_scalar(
+        Column source_strings,
+        object py_separator,
+        object py_narep):
+    """
+    Returns a Column by concatenating Lists of strings row-wise
+    in `source_strings` with the specified `py_separator`
+    between each string in lists and `<NA>`/`None` values
+    are replaced by `py_narep`
+    """
+
+    cdef DeviceScalar separator = py_separator.device_value
+    cdef DeviceScalar narep = py_narep.device_value
+
+    cdef unique_ptr[column] c_result
+    cdef column_view source_view = source_strings.view()
+
+    cdef const string_scalar* scalar_separator = \
+        <const string_scalar*>(separator.get_raw_ptr())
+    cdef const string_scalar* scalar_narep = <const string_scalar*>(
+        narep.get_raw_ptr()
+    )
+
+    with nogil:
+        c_result = move(cpp_concatenate_list_elements(
+            source_view,
+            scalar_separator[0],
+            scalar_narep[0]
+        ))
+
+    return Column.from_unique_ptr(move(c_result))
+
+
+def join_lists_with_column(
+        Column source_strings,
+        Column separator_strings,
+        object py_source_narep,
+        object py_separator_narep):
+    """
+    Returns a Column by concatenating Lists of strings row-wise in
+    `source_strings` with a corresponding separator at the same
+    position in `separator_strings` and `<NA>`/`None` values in
+    `source_strings` are replaced by `py_source_narep` and
+    `<NA>`/`None` values in `separator_strings` are replaced
+    by `py_separator_narep`
+    """
+
+    cdef DeviceScalar source_narep = py_source_narep.device_value
+    cdef DeviceScalar separator_narep = py_separator_narep.device_value
+
+    cdef unique_ptr[column] c_result
+    cdef column_view source_view = source_strings.view()
+    cdef column_view separator_view = separator_strings.view()
+
+    cdef const string_scalar* scalar_source_narep = \
+        <const string_scalar*>(source_narep.get_raw_ptr())
+    cdef const string_scalar* scalar_separator_narep = <const string_scalar*>(
+        separator_narep.get_raw_ptr()
+    )
+
+    with nogil:
+        c_result = move(cpp_concatenate_list_elements(
+            source_view,
+            separator_view,
+            scalar_separator_narep[0],
+            scalar_source_narep[0]
+        ))
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index de2df9b50d7..bd94b93f985 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -78,6 +78,8 @@
 from cudf._lib.strings.combine import (
     concatenate as cpp_concatenate,
     join as cpp_join,
+    join_lists_with_column as cpp_join_lists_with_column,
+    join_lists_with_scalar as cpp_join_lists_with_scalar,
 )
 from cudf._lib.strings.contains import (
     contains_re as cpp_contains_re,
@@ -464,7 +466,9 @@ def cat(self, others=None, sep=None, na_rep=None):
                 out = out[0]
         return out
 
-    def join(self, sep) -> ParentType:
+    def join(
+        self, sep=None, string_na_rep=None, sep_na_rep=None
+    ) -> ParentType:
         """
         Join lists contained as elements in the Series/Index with passed
         delimiter.
@@ -472,9 +476,81 @@ def join(self, sep) -> ParentType:
         Raises : NotImplementedError
             Columns of arrays / lists are not yet supported.
         """
-        raise NotImplementedError(
-            "Columns of arrays / lists are not yet " "supported"
+        if sep is None:
+            sep = ""
+
+        if string_na_rep is None:
+            string_na_rep = ""
+
+        if is_scalar(sep) and sep_na_rep is not None:  # also rejects ""
+            raise ValueError(
+                "sep_na_rep cannot be defined when `sep` is scalar."
+            )
+
+        if sep_na_rep is None:
+            sep_na_rep = ""
+
+        if not is_scalar(string_na_rep):
+            raise TypeError(
+                f"string_na_rep should be a string scalar, got {string_na_rep}"
+                f" of type : {type(string_na_rep)}"
+            )
+
+        if isinstance(self._column, cudf.core.column.ListColumn):
+            strings_column = self._column
+        else:
+            # If self._column is not a ListColumn, we will have to
+            # split each row by character and create a ListColumn out of it.
+            strings_column = self._split_by_character()
+
+        if is_scalar(sep):
+            data = cpp_join_lists_with_scalar(
+                strings_column, cudf.Scalar(sep), cudf.Scalar(string_na_rep)
+            )
+        elif can_convert_to_column(sep):
+            sep_column = column.as_column(sep)
+            if len(sep_column) != len(strings_column):
+                raise ValueError(
+                    f"sep should be of similar size to the series, "
+                    f"got: {len(sep_column)}, expected: {len(strings_column)}"
+                )
+            if not is_scalar(sep_na_rep):
+                raise TypeError(
+                    f"sep_na_rep should be a string scalar, got {sep_na_rep} "
+                    f"of type: {type(sep_na_rep)}"
+                )
+
+            data = cpp_join_lists_with_column(
+                strings_column,
+                sep_column,
+                cudf.Scalar(string_na_rep),
+                cudf.Scalar(sep_na_rep),
+            )
+        else:
+            raise TypeError(
+                f"sep should be an str, array-like or Series object, "
+                f"found {type(sep)}"
+            )
+
+        return self._return_or_inplace(data)
+
+    def _split_by_character(self):
+        result_col = cpp_character_tokenize(self._column)
+
+        bytes_count = cpp_count_bytes(self._column)
+        offset_col = cudf.core.column.as_column([0], dtype="int32")
+        offset_col = offset_col.append(bytes_count)
+        offset_col = offset_col._apply_scan_op("sum")
+
+        res = cudf.core.column.ListColumn(
+            size=len(self._column),
+            dtype=cudf.ListDtype(self._column.dtype),
+            mask=self._column.mask,
+            offset=0,
+            null_count=self._column.null_count,
+            children=(offset_col, result_col),
         )
+        return res
 
     def extract(
         self, pat: str, flags: int = 0, expand: bool = True
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 2ca6bc622be..0ff5b81ce81 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -807,8 +807,7 @@ def test_string_cat_str_error():
         gs.str.cat(gs.str)
 
 
-@pytest.mark.xfail(raises=(NotImplementedError, AttributeError))
-@pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"])
+@pytest.mark.parametrize("sep", ["", " ", "|", ",", "|||"])
 def test_string_join(ps_gs, sep):
     ps, gs = ps_gs
 
@@ -2931,3 +2930,122 @@ def test_string_slice_with_mask():
     assert_eq(actual._column.null_count, expected._column.null_count)
 
     assert_eq(actual, expected)
+
+
+def test_str_join_lists_error():
+    sr = cudf.Series([["a", "a"], ["b"], ["c"]])
+
+    with pytest.raises(
+        ValueError, match="sep_na_rep cannot be defined when `sep` is scalar."
+    ):
+        sr.str.join(sep="-", sep_na_rep="-")
+
+    with pytest.raises(
+        TypeError,
+        match=re.escape(
+            "string_na_rep should be a string scalar, got [10, 20] of type "
+            ": <class 'list'>"
+        ),
+    ):
+        sr.str.join(string_na_rep=[10, 20])
+
+    with pytest.raises(
+        ValueError,
+        match=re.escape(
+            "sep should be of similar size to the series, got: 2, expected: 3"
+        ),
+    ):
+        sr.str.join(sep=["=", "-"])
+
+    with pytest.raises(
+        TypeError,
+        match=re.escape(
+            "sep_na_rep should be a string scalar, got "
+            "['na'] of type: <class 'list'>"
+        ),
+    ):
+        sr.str.join(sep=["-", "+", "."], sep_na_rep=["na"])
+
+    with pytest.raises(
+        TypeError,
+        match=re.escape(
+            "sep should be an str, array-like or Series object, "
+            "found <class 'cudf.core.dataframe.DataFrame'>"
+        ),
+    ):
+        sr.str.join(sep=cudf.DataFrame())
+
+
+@pytest.mark.parametrize(
+    "sr,sep,string_na_rep,sep_na_rep,expected",
+    [
+        (
+            cudf.Series([["a", "a"], ["b"], ["c"]]),
+            "-",
+            None,
+            None,
+            cudf.Series(["a-a", "b", "c"]),
+        ),
+        (
+            cudf.Series([["a", "b"], [None], [None, "hello", None, "world"]]),
+            "__",
+            "=",
+            None,
+            cudf.Series(["a__b", "=", "=__hello__=__world"]),
+        ),
+        (
+            cudf.Series(
+                [
+                    ["a", None, "b"],
+                    [None],
+                    [None, "hello", None, "world"],
+                    None,
+                ]
+            ),
+            ["-", "_", "**", "!"],
+            None,
+            None,
+            cudf.Series(["a--b", "", "**hello****world", None]),
+        ),
+        (
+            cudf.Series(
+                [
+                    ["a", None, "b"],
+                    [None],
+                    [None, "hello", None, "world"],
+                    None,
+                ]
+            ),
+            ["-", "_", "**", None],
+            "rep_str",
+            "sep_str",
+            cudf.Series(
+                [
+                    "a-rep_str-b",
+                    "rep_str",
+                    "rep_str**hello**rep_str**world",
+                    None,
+                ]
+            ),
+        ),
+        (
+            cudf.Series([[None, "a"], [None], None]),
+            ["-", "_", None],
+            "rep_str",
+            None,
+            cudf.Series(["rep_str-a", "rep_str", None]),
+        ),
+        (
+            cudf.Series([[None, "a"], [None], None]),
+            ["-", "_", None],
+            None,
+            "sep_str",
+            cudf.Series(["-a", "", None]),
+        ),
+    ],
+)
+def test_str_join_lists(sr, sep, string_na_rep, sep_na_rep, expected):
+    actual = sr.str.join(
+        sep=sep, string_na_rep=string_na_rep, sep_na_rep=sep_na_rep
+    )
+    assert_eq(actual, expected)

From 59a488fe40a61f8799eb8573665ed8fea91303e4 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 27 Apr 2021 14:49:28 -0700
Subject: [PATCH 2/4] add docs

---
 python/cudf/cudf/core/column/string.py | 120 ++++++++++++++++++++++---
 1 file changed, 109 insertions(+), 11 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index bd94b93f985..f737bff6468 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -473,9 +473,107 @@ def join(
         Join lists contained as elements in the Series/Index with passed
         delimiter.
 
-        Raises : NotImplementedError
-            Columns of arrays / lists are not yet supported.
-        """
+        If the elements of a Series are lists themselves, join the content of
+        these lists using the delimiter passed to the function.
+        This function is equivalent to :meth:`str.join`.
+
+        Parameters
+        ----------
+        sep : str or array-like
+            If str, the delimiter is used between list entries.
+            If array-like, the string at a position is used as a
+            delimiter for corresponding row of the list entries.
+        string_na_rep : str, default None
+            This string will take the place of any null strings
+            (not empty strings) in the Series.
+            If ``string_na_rep`` is ``None``, it defaults to an empty
+            string "".
+        sep_na_rep : str, default None
+            This string will take the place of any null strings
+            (not empty strings) in `sep`. This parameter can be used
+            only if `sep` is array-like. If ``sep_na_rep`` is ``None``,
+            it defaults to an empty string "".
+
+        Returns
+        -------
+        Series/Index: object
+            The list entries concatenated by intervening occurrences of
+            the delimiter.
+
+        Raises
+        ------
+        ValueError
+            - If ``sep_na_rep`` is supplied when ``sep`` is str.
+            - If ``sep`` is array-like and not of equal length with Series/Index.
+        TypeError
+            - If ``string_na_rep`` or ``sep_na_rep`` are not scalar values.
+            - If ``sep`` is not of following types: str or array-like.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> ser = cudf.Series([['a', 'b', 'c'], ['d', 'e'], ['f'], ['g', ' ', 'h']])
+        >>> ser
+        0    [a, b, c]
+        1       [d, e]
+        2          [f]
+        3    [g,  , h]
+        dtype: list
+        >>> ser.str.join(sep='-')
+        0    a-b-c
+        1      d-e
+        2        f
+        3    g- -h
+        dtype: object
+
+        ``sep`` can be an array-like input:
+
+        >>> ser.str.join(sep=['-', '+', '.', '='])
+        0    a-b-c
+        1      d+e
+        2        f
+        3    g= =h
+        dtype: object
+
+        If the actual series doesn't have lists, each character is joined
+        by `sep`:
+
+        >>> ser = cudf.Series(['abc', 'def', 'ghi'])
+        >>> ser
+        0    abc
+        1    def
+        2    ghi
+        dtype: object
+        >>> ser.str.join(sep='_')
+        0    a_b_c
+        1    d_e_f
+        2    g_h_i
+        dtype: object
+
+        We can replace `<NA>`/`None` values present in lists using
+        ``string_na_rep``:
+
+        >>> ser = cudf.Series([['a', 'b', None], None, ['c', 'd']])
+        >>> ser
+        0    [a, b, None]
+        1            None
+        2          [c, d]
+        dtype: list
+        >>> ser.str.join(sep='_', string_na_rep='k')
+        0    a_b_k
+        1     <NA>
+        2      c_d
+        dtype: object
+
+        We can replace `<NA>`/`None` values present in lists of ``sep``
+        using ``sep_na_rep``:
+
+        >>> ser.str.join(sep=[None, '.', '-'], sep_na_rep='+')
+        0    a+b+
+        1    <NA>
+        2     c-d
+        dtype: object
+        """  # noqa E501
         if sep is None:
             sep = ""
 
@@ -586,7 +684,7 @@ def extract(
         --------
         >>> import cudf
         >>> s = cudf.Series(['a1', 'b2', 'c3'])
-        >>> s.str.extract(r'([ab])(\d)')                                # noqa W605
+        >>> s.str.extract(r'([ab])(\d)')
               0     1
         0     a     1
         1     b     2
@@ -595,7 +693,7 @@ def extract(
         A pattern with one group will return a DataFrame with one
         column if expand=True.
 
-        >>> s.str.extract(r'[ab](\d)', expand=True)                     # noqa W605
+        >>> s.str.extract(r'[ab](\d)', expand=True)
               0
         0     1
         1     2
@@ -603,12 +701,12 @@ def extract(
 
         A pattern with one group will return a Series if expand=False.
 
-        >>> s.str.extract(r'[ab](\d)', expand=False)                    # noqa W605
+        >>> s.str.extract(r'[ab](\d)', expand=False)
         0       1
         1       2
         2    <NA>
         dtype: object
-        """
+        """  # noqa W605
         if flags != 0:
             raise NotImplementedError("`flags` parameter is not yet supported")
 
@@ -696,7 +794,7 @@ def contains(
 
         Returning any digit using regular expression.
 
-        >>> s1.str.contains('\d', regex=True)                               # noqa W605
+        >>> s1.str.contains('\d', regex=True)
         0    False
         1    False
         2    False
@@ -729,7 +827,7 @@ def contains(
         3     True
         4     <NA>
         dtype: bool
-        """
+        """  # noqa W605
         if case is not True:
             raise NotImplementedError("`case` parameter is not yet supported")
         elif flags != 0:
@@ -3150,7 +3248,7 @@ def count(self, pat: str, flags: int = 0) -> ParentType:
         Escape ``'$'`` to find the literal dollar sign.
 
         >>> s = cudf.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
-        >>> s.str.count('\$')                                       # noqa W605
+        >>> s.str.count('\$')
         0    1
         1    0
         2    1
@@ -3164,7 +3262,7 @@ def count(self, pat: str, flags: int = 0) -> ParentType:
         >>> index = cudf.core.index.StringIndex(['A', 'A', 'Aaba', 'cat'])
         >>> index.str.count('a')
         Int64Index([0, 0, 2, 1], dtype='int64')
-        """
+        """  # noqa W605
         if flags != 0:
             raise NotImplementedError("`flags` parameter is not yet supported")
 

From 49e92489efe9240ac8cc9cb6f215d26f931c6b6e Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 28 Apr 2021 07:39:49 -0700
Subject: [PATCH 3/4] use empty column

---
 python/cudf/cudf/core/column/string.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index f737bff6468..42e7660f9c8 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -636,8 +636,11 @@ def _split_by_character(self):
         result_col = cpp_character_tokenize(self._column)
 
         bytes_count = cpp_count_bytes(self._column)
-        offset_col = cudf.core.column.as_column([0], dtype="int32")
-        offset_col = offset_col.append(bytes_count)
+        offset_col = cudf.core.column.column_empty(
+            row_count=len(bytes_count) + 1, dtype="int32"
+        )
+        offset_col[0] = 0
+        offset_col[1:] = bytes_count
         offset_col = offset_col._apply_scan_op("sum")
 
         res = cudf.core.column.ListColumn(

From 743981d218fbc92e5d06e3c42e4dcc0e93fc358b Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 28 Apr 2021 07:42:55 -0700
Subject: [PATCH 4/4] add todo

---
 python/cudf/cudf/core/column/string.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 42e7660f9c8..5cb6f53b685 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -599,6 +599,10 @@ def join(
         else:
             # If self._column is not a ListColumn, we will have to
             # split each row by character and create a ListColumn out of it.
+
+            # TODO: Remove this workaround after the following
+            # feature request is resolved
+            # FEA: https://github.com/rapidsai/cudf/issues/8094
             strings_column = self._split_by_character()
 
         if is_scalar(sep):