From dde5c8b937f2b4a5dc45ef7167d011af2899fdf5 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 27 Apr 2021 14:01:44 -0700 Subject: [PATCH 1/4] enable str.join API --- python/cudf/cudf/_lib/cpp/strings/combine.pxd | 13 +- python/cudf/cudf/_lib/strings/combine.pyx | 76 ++++++++++- python/cudf/cudf/core/column/string.py | 82 +++++++++++- python/cudf/cudf/tests/test_string.py | 122 +++++++++++++++++- 4 files changed, 285 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/strings/combine.pxd b/python/cudf/cudf/_lib/cpp/strings/combine.pxd index 2670c67908f..250c6441882 100644 --- a/python/cudf/cudf/_lib/cpp/strings/combine.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/combine.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table_view cimport table_view @@ -17,3 +17,14 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: column_view source_strings, string_scalar separator, string_scalar narep) except + + + cdef unique_ptr[column] concatenate_list_elements( + column_view lists_strings_column, + column_view separators, + string_scalar separator_narep, + string_scalar string_narep) except + + + cdef unique_ptr[column] concatenate_list_elements( + column_view lists_strings_column, + string_scalar separator, + string_scalar narep) except + diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 04fde5be9e8..25619de3ed0 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -15,7 +15,8 @@ from cudf._lib.table cimport Table from cudf._lib.cpp.strings.combine cimport ( concatenate as cpp_concatenate, - join_strings as cpp_join_strings + join_strings as cpp_join_strings, + concatenate_list_elements as cpp_concatenate_list_elements ) @@ -78,3 +79,74 @@ def join(Column source_strings, )) return Column.from_unique_ptr(move(c_result)) + + +def join_lists_with_scalar( + Column source_strings, + object py_separator, + object py_narep): + """ + Returns a Column by concatenating Lists of strings row-wise + in `source_strings` with the specified `py_separator` + between each string in lists and `<NA>`/`None` values + are replaced by `py_narep` + """ + + cdef DeviceScalar separator = py_separator.device_value + cdef DeviceScalar narep = py_narep.device_value + + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + cdef const string_scalar* scalar_separator = \ + <const string_scalar*>(separator.get_raw_ptr()) + cdef const string_scalar* scalar_narep = <const string_scalar*>( + narep.get_raw_ptr() + ) + + with nogil: + c_result = move(cpp_concatenate_list_elements( + source_view, + scalar_separator[0], + scalar_narep[0] + )) + + return Column.from_unique_ptr(move(c_result)) + + +def join_lists_with_column( + Column source_strings, + Column separator_strings, + object py_source_narep, + object py_separator_narep): + """ + Returns a Column by concatenating Lists of strings row-wise in + `source_strings` with a corresponding separator at the same + position in `separator_strings` and `<NA>`/`None` values in + `source_strings` are replaced by `py_source_narep` and + `<NA>`/`None` values in `separator_strings` are replaced + by `py_separator_narep` + """ + + cdef DeviceScalar source_narep = py_source_narep.device_value + cdef DeviceScalar separator_narep = py_separator_narep.device_value + + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + cdef column_view 
separator_view = separator_strings.view() + + cdef const string_scalar* scalar_source_narep = \ + <const string_scalar*>(source_narep.get_raw_ptr()) + cdef const string_scalar* scalar_separator_narep = <const string_scalar*>( + separator_narep.get_raw_ptr() + ) + + with nogil: + c_result = move(cpp_concatenate_list_elements( + source_view, + separator_view, + scalar_separator_narep[0], + scalar_source_narep[0] + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index de2df9b50d7..bd94b93f985 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -78,6 +78,8 @@ from cudf._lib.strings.combine import ( concatenate as cpp_concatenate, join as cpp_join, + join_lists_with_column as cpp_join_lists_with_column, + join_lists_with_scalar as cpp_join_lists_with_scalar, ) from cudf._lib.strings.contains import ( contains_re as cpp_contains_re, @@ -464,7 +466,9 @@ def cat(self, others=None, sep=None, na_rep=None): out = out[0] return out - def join(self, sep) -> ParentType: + def join( + self, sep=None, string_na_rep=None, sep_na_rep=None + ) -> ParentType: """ Join lists contained as elements in the Series/Index with passed delimiter. @@ -472,9 +476,81 @@ def join(self, sep) -> ParentType: Raises : NotImplementedError Columns of arrays / lists are not yet supported. """ - raise NotImplementedError( - "Columns of arrays / lists are not yet " "supported" + if sep is None: + sep = "" + + if string_na_rep is None: + string_na_rep = "" + + if is_scalar(sep) and sep_na_rep: + raise ValueError( + "sep_na_rep cannot be defined when `sep` is scalar." 
+ ) + + if sep_na_rep is None: + sep_na_rep = "" + + if not is_scalar(string_na_rep): + raise TypeError( + f"string_na_rep should be a string scalar, got {string_na_rep}" + f" of type : {type(string_na_rep)}" + ) + + if isinstance(self._column, cudf.core.column.ListColumn): + strings_column = self._column + else: + # If self._column is not a ListColumn, we will have to + # split each row by character and create a ListColumn out of it. + strings_column = self._split_by_character() + + if is_scalar(sep): + data = cpp_join_lists_with_scalar( + strings_column, cudf.Scalar(sep), cudf.Scalar(string_na_rep) + ) + elif can_convert_to_column(sep): + sep_column = column.as_column(sep) + if len(sep_column) != len(strings_column): + raise ValueError( + f"sep should be of similar size to the series, " + f"got: {len(sep_column)}, expected: {len(strings_column)}" + ) + if not is_scalar(sep_na_rep): + raise TypeError( + f"sep_na_rep should be a string scalar, got {sep_na_rep} " + f"of type: {type(sep_na_rep)}" + ) + + data = cpp_join_lists_with_column( + strings_column, + sep_column, + cudf.Scalar(string_na_rep), + cudf.Scalar(sep_na_rep), + ) + else: + raise TypeError( + f"sep should be an str, array-like or Series object, " + f"found {type(sep)}" + ) + + return self._return_or_inplace(data) + + def _split_by_character(self): + result_col = cpp_character_tokenize(self._column) + + bytes_count = cpp_count_bytes(self._column) + offset_col = cudf.core.column.as_column([0], dtype="int32") + offset_col = offset_col.append(bytes_count) + offset_col = offset_col._apply_scan_op("sum") + + res = cudf.core.column.ListColumn( + size=len(self._column), + dtype=cudf.ListDtype(self._column.dtype), + mask=self._column.mask, + offset=0, + null_count=self._column.null_count, + children=(offset_col, result_col), ) + return res def extract( self, pat: str, flags: int = 0, expand: bool = True diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 
2ca6bc622be..0ff5b81ce81 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -807,8 +807,7 @@ def test_string_cat_str_error(): gs.str.cat(gs.str) -@pytest.mark.xfail(raises=(NotImplementedError, AttributeError)) -@pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) +@pytest.mark.parametrize("sep", ["", " ", "|", ",", "|||"]) def test_string_join(ps_gs, sep): ps, gs = ps_gs @@ -2931,3 +2930,122 @@ def test_string_slice_with_mask(): assert_eq(actual._column.null_count, expected._column.null_count) assert_eq(actual, expected) + + +def test_str_join_lists_error(): + sr = cudf.Series([["a", "a"], ["b"], ["c"]]) + + with pytest.raises( + ValueError, match="sep_na_rep cannot be defined when `sep` is scalar." + ): + sr.str.join(sep="-", sep_na_rep="-") + + with pytest.raises( + TypeError, + match=re.escape( + "string_na_rep should be a string scalar, got [10, 20] of type " + ": " + ), + ): + sr.str.join(string_na_rep=[10, 20]) + + with pytest.raises( + ValueError, + match=re.escape( + "sep should be of similar size to the series, got: 2, expected: 3" + ), + ): + sr.str.join(sep=["=", "-"]) + + with pytest.raises( + TypeError, + match=re.escape( + "sep_na_rep should be a string scalar, got " + "['na'] of type: " + ), + ): + sr.str.join(sep=["-", "+", "."], sep_na_rep=["na"]) + + with pytest.raises( + TypeError, + match=re.escape( + "sep should be an str, array-like or Series object, " + "found " + ), + ): + sr.str.join(sep=cudf.DataFrame()) + + +@pytest.mark.parametrize( + "sr,sep,string_na_rep,sep_na_rep,expected", + [ + ( + cudf.Series([["a", "a"], ["b"], ["c"]]), + "-", + None, + None, + cudf.Series(["a-a", "b", "c"]), + ), + ( + cudf.Series([["a", "b"], [None], [None, "hello", None, "world"]]), + "__", + "=", + None, + cudf.Series(["a__b", "=", "=__hello__=__world"]), + ), + ( + cudf.Series( + [ + ["a", None, "b"], + [None], + [None, "hello", None, "world"], + None, + ] + ), + ["-", "_", "**", "!"], + None, + 
None, + cudf.Series(["a--b", "", "**hello****world", None]), + ), + ( + cudf.Series( + [ + ["a", None, "b"], + [None], + [None, "hello", None, "world"], + None, + ] + ), + ["-", "_", "**", None], + "rep_str", + "sep_str", + cudf.Series( + [ + "a-rep_str-b", + "rep_str", + "rep_str**hello**rep_str**world", + None, + ] + ), + ), + ( + cudf.Series([[None, "a"], [None], None]), + ["-", "_", None], + "rep_str", + None, + cudf.Series(["rep_str-a", "rep_str", None]), + ), + ( + cudf.Series([[None, "a"], [None], None]), + ["-", "_", None], + None, + "sep_str", + cudf.Series(["-a", "", None]), + ), + ], +) +def test_str_join_lists(sr, sep, string_na_rep, sep_na_rep, expected): + actual = sr.str.join( + sep=sep, string_na_rep=string_na_rep, sep_na_rep=sep_na_rep + ) + assert_eq(actual, expected) From 59a488fe40a61f8799eb8573665ed8fea91303e4 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 27 Apr 2021 14:49:28 -0700 Subject: [PATCH 2/4] add docs --- python/cudf/cudf/core/column/string.py | 120 ++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index bd94b93f985..f737bff6468 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -473,9 +473,107 @@ def join( Join lists contained as elements in the Series/Index with passed delimiter. - Raises : NotImplementedError - Columns of arrays / lists are not yet supported. - """ + If the elements of a Series are lists themselves, join the content of + these lists using the delimiter passed to the function. + This function is an equivalent to :meth:`str.join`. + + Parameters + ---------- + sep : str or array-like + If str, the delimiter is used between list entries. + If array-like, the string at a position is used as a + delimiter for corresponding row of the list entries. 
+ string_na_rep : str, default None + This character will take the place of any null strings + (not empty strings) in the Series. + If ``string_na_rep`` is ``None``, it defaults to an empty + string "". + sep_na_rep : str, default None + This character will take the place of any null strings + (not empty strings) in `sep`. This parameter can be used + only if `sep` is array-like. If ``sep_na_rep`` is ``None``, + it defaults to an empty string "". + + Returns + ------- + Series/Index: object + The list entries concatenated by intervening occurrences of + the delimiter. + + Raises + ------ + ValueError + - If ``sep_na_rep`` is supplied when ``sep`` is str. + - If ``sep`` is array-like and not of equal length with Series/Index. + TypeError + - If ``string_na_rep`` or ``sep_na_rep`` are not scalar values. + - If ``sep`` is not one of the following types: str or array-like. + + Examples + -------- + >>> import cudf + >>> ser = cudf.Series([['a', 'b', 'c'], ['d', 'e'], ['f'], ['g', ' ', 'h']]) + >>> ser + 0 [a, b, c] + 1 [d, e] + 2 [f] + 3 [g, , h] + dtype: list + >>> ser.str.join(sep='-') + 0 a-b-c + 1 d-e + 2 f + 3 g- -h + dtype: object + + ``sep`` can be an array-like input: + + >>> ser.str.join(sep=['-', '+', '.', '=']) + 0 a-b-c + 1 d+e + 2 f + 3 g= =h + dtype: object + + If the actual series doesn't have lists, each character is joined + by `sep`: + + >>> ser = cudf.Series(['abc', 'def', 'ghi']) + >>> ser + 0 abc + 1 def + 2 ghi + dtype: object + >>> ser.str.join(sep='_') + 0 a_b_c + 1 d_e_f + 2 g_h_i + dtype: object + + We can replace `<NA>`/`None` values present in lists using + ``string_na_rep``: + + >>> ser = cudf.Series([['a', 'b', None], None, ['c', 'd']]) + >>> ser + 0 [a, b, None] + 1 None + 2 [c, d] + dtype: list + >>> ser.str.join(sep='_', string_na_rep='k') + 0 a_b_k + 1 <NA> + 2 c_d + dtype: object + + We can replace `<NA>`/`None` values present in lists of ``sep`` + using ``sep_na_rep``: + + >>> ser.str.join(sep=[None, '.', '-'], sep_na_rep='+') + 0 a+b+ + 1 <NA> + 2 c-d + dtype: object + 
""" # noqa E501 if sep is None: sep = "" @@ -586,7 +684,7 @@ def extract( -------- >>> import cudf >>> s = cudf.Series(['a1', 'b2', 'c3']) - >>> s.str.extract(r'([ab])(\d)') # noqa W605 + >>> s.str.extract(r'([ab])(\d)') 0 1 0 a 1 1 b 2 @@ -595,7 +693,7 @@ def extract( A pattern with one group will return a DataFrame with one column if expand=True. - >>> s.str.extract(r'[ab](\d)', expand=True) # noqa W605 + >>> s.str.extract(r'[ab](\d)', expand=True) 0 0 1 1 2 @@ -603,12 +701,12 @@ def extract( A pattern with one group will return a Series if expand=False. - >>> s.str.extract(r'[ab](\d)', expand=False) # noqa W605 + >>> s.str.extract(r'[ab](\d)', expand=False) 0 1 1 2 2 dtype: object - """ + """ # noqa W605 if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") @@ -696,7 +794,7 @@ def contains( Returning any digit using regular expression. - >>> s1.str.contains('\d', regex=True) # noqa W605 + >>> s1.str.contains('\d', regex=True) 0 False 1 False 2 False @@ -729,7 +827,7 @@ def contains( 3 True 4 dtype: bool - """ + """ # noqa W605 if case is not True: raise NotImplementedError("`case` parameter is not yet supported") elif flags != 0: @@ -3150,7 +3248,7 @@ def count(self, pat: str, flags: int = 0) -> ParentType: Escape ``'$'`` to find the literal dollar sign. 
>>> s = cudf.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) - >>> s.str.count('\$') # noqa W605 + >>> s.str.count('\$') 0 1 1 0 2 1 @@ -3164,7 +3262,7 @@ def count(self, pat: str, flags: int = 0) -> ParentType: >>> index = cudf.core.index.StringIndex(['A', 'A', 'Aaba', 'cat']) >>> index.str.count('a') Int64Index([0, 0, 2, 1], dtype='int64') - """ + """ # noqa W605 if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") From 49e92489efe9240ac8cc9cb6f215d26f931c6b6e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 28 Apr 2021 07:39:49 -0700 Subject: [PATCH 3/4] use empty column --- python/cudf/cudf/core/column/string.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index f737bff6468..42e7660f9c8 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -636,8 +636,11 @@ def _split_by_character(self): result_col = cpp_character_tokenize(self._column) bytes_count = cpp_count_bytes(self._column) - offset_col = cudf.core.column.as_column([0], dtype="int32") - offset_col = offset_col.append(bytes_count) + offset_col = cudf.core.column.column_empty( + row_count=len(bytes_count) + 1, dtype="int32" + ) + offset_col[0] = 0 + offset_col[1:] = bytes_count offset_col = offset_col._apply_scan_op("sum") res = cudf.core.column.ListColumn( From 743981d218fbc92e5d06e3c42e4dcc0e93fc358b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 28 Apr 2021 07:42:55 -0700 Subject: [PATCH 4/4] add todo --- python/cudf/cudf/core/column/string.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 42e7660f9c8..5cb6f53b685 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -599,6 +599,10 @@ def join( else: # If self._column is not a ListColumn, we will have to # split each 
row by character and create a ListColumn out of it. + + # TODO: Remove this workaround after the following + # feature request is resolved + # FEA: https://github.com/rapidsai/cudf/issues/8094 strings_column = self._split_by_character() if is_scalar(sep):