diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst new file mode 100644 index 00000000000..38a46641200 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst @@ -0,0 +1,6 @@ +======= +combine +======= + +.. automodule:: pylibcudf.strings.combine + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 65dc5d2d1c3..c8c0016126d 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -6,6 +6,7 @@ strings capitalize char_types + combine contains extract find diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 76cc13db0da..0f7b27d85d7 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -2,24 +2,11 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +from cudf._lib.column cimport Column -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.combine cimport ( - concatenate as cpp_concatenate, - join_list_elements as cpp_join_list_elements, - join_strings as cpp_join_strings, - output_if_empty_list, - separator_on_nulls, -) -from pylibcudf.libcudf.table.table_view cimport table_view +import pylibcudf as plc -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_columns +import cudf @acquire_spill_lock() @@ -31,26 +18,12 @@ def concatenate(list source_strings, with the specified `sep` between each column and `na`/`None` values are replaced by `na_rep` 
""" - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef table_view source_view = table_view_from_columns(source_strings) - - cdef const string_scalar* scalar_separator = \ - (separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.concatenate( + plc.Table([col.to_pylibcudf(mode="read") for col in source_strings]), + sep.device_value.c_value, + na_rep.device_value.c_value, ) - - with nogil: - c_result = move(cpp_concatenate( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -62,27 +35,12 @@ def join(Column source_strings, with the specified `sep` between each column and `na`/`None` values are replaced by `na_rep` """ - - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_separator = \ - (separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.join_strings( + source_strings.to_pylibcudf(mode="read"), + sep.device_value.c_value, + na_rep.device_value.c_value, ) - - with nogil: - c_result = move(cpp_join_strings( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -96,29 +54,15 @@ def join_lists_with_scalar( between each string in lists and ``/`None` values are replaced by `py_narep` """ - - cdef DeviceScalar separator = py_separator.device_value - cdef DeviceScalar narep = py_narep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* 
scalar_separator = \ - (separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.join_list_elements( + source_strings.to_pylibcudf(mode="read"), + py_separator.device_value.c_value, + py_narep.device_value.c_value, + cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) - - with nogil: - c_result = move(cpp_join_list_elements( - source_view, - scalar_separator[0], - scalar_narep[0], - separator_on_nulls.YES, - output_if_empty_list.NULL_ELEMENT - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -135,28 +79,12 @@ def join_lists_with_column( ``/`None` values in `separator_strings` are replaced by `py_separator_narep` """ - - cdef DeviceScalar source_narep = py_source_narep.device_value - cdef DeviceScalar separator_narep = py_separator_narep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view separator_view = separator_strings.view() - - cdef const string_scalar* scalar_source_narep = \ - (source_narep.get_raw_ptr()) - cdef const string_scalar* scalar_separator_narep = ( - separator_narep.get_raw_ptr() + plc_column = plc.strings.combine.join_list_elements( + source_strings.to_pylibcudf(mode="read"), + separator_strings.to_pylibcudf(mode="read"), + py_separator_narep.device_value.c_value, + py_source_narep.device_value.c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) - - with nogil: - c_result = move(cpp_join_list_elements( - source_view, - separator_view, - scalar_separator_narep[0], - scalar_source_narep[0], - separator_on_nulls.YES, - output_if_empty_list.NULL_ELEMENT - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) diff --git 
a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt index b8b4343173e..f5f2113332a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx) +set(cython_sources char_types.pyx combine.pyx regex_flags.pyx side_type.pyx translate.pyx) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd index e4c9fa5817a..e659993b834 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libcpp cimport int from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -9,21 +10,29 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: - ctypedef enum separator_on_nulls: - YES 'cudf::strings::separator_on_nulls::YES' - NO 'cudf::strings::separator_on_nulls::NO' + cpdef enum class separator_on_nulls(int): + YES + NO - ctypedef enum output_if_empty_list: - EMPTY_STRING 'cudf::strings::output_if_empty_list::EMPTY_STRING' - NULL_ELEMENT 'cudf::strings::output_if_empty_list::NULL_ELEMENT' + cpdef enum class output_if_empty_list(int): + EMPTY_STRING + NULL_ELEMENT cdef unique_ptr[column] concatenate( - table_view source_strings, + table_view strings_columns, string_scalar separator, - string_scalar narep) except + + string_scalar narep, + separator_on_nulls separate_nulls) except + + + cdef unique_ptr[column] concatenate( + table_view 
strings_columns, + column_view separators, + string_scalar separator_narep, + string_scalar col_narep, + separator_on_nulls separate_nulls) except + cdef unique_ptr[column] join_strings( - column_view source_strings, + column_view input, string_scalar separator, string_scalar narep) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index eeb44d19333..04dd131cd75 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -18,6 +18,7 @@ set(cython_sources case.pyx char_types.pyx contains.pyx + combine.pyx extract.pyx find.pyx find_multiple.pyx diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index e45048a500f..93c61f3f72c 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -5,6 +5,7 @@ from . cimport ( capitalize, case, char_types, + combine, contains, convert, extract, diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index c6253d94b40..d52b0405f1e 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -5,6 +5,7 @@ capitalize, case, char_types, + combine, contains, convert, extract, diff --git a/python/pylibcudf/pylibcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/strings/combine.pxd new file mode 100644 index 00000000000..ea22f626973 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pxd @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.combine cimport ( + output_if_empty_list, + separator_on_nulls, +) +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column concatenate( + Table strings_columns, + ColumnOrScalar separator, + Scalar narep=*, + Scalar col_narep=*, + separator_on_nulls separate_nulls=*, +) + +cpdef Column join_strings(Column input, Scalar separator, Scalar narep) + + +cpdef Column join_list_elements( + Column source_strings, + ColumnOrScalar separator, + Scalar separator_narep, + Scalar string_narep, + separator_on_nulls separate_nulls, + output_if_empty_list empty_list_policy, +) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx new file mode 100644 index 00000000000..f17d5265ab4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -0,0 +1,223 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings cimport combine as cpp_combine +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +from cython.operator import dereference +from pylibcudf.libcudf.strings.combine import \ + output_if_empty_list as OutputIfEmptyList # no-cython-lint +from pylibcudf.libcudf.strings.combine import \ + separator_on_nulls as SeparatorOnNulls # no-cython-lint + + +cpdef Column concatenate( + Table strings_columns, + ColumnOrScalar separator, + Scalar narep=None, + Scalar col_narep=None, + separator_on_nulls separate_nulls=separator_on_nulls.YES, +): + """ + Concatenate all columns in the table horizontally into one new strings + column, with each row's strings delimited by a separator string. + + Parameters + ---------- + strings_columns : Table + Strings for this operation + + separator : Column or Scalar + Separator(s) for a given row + + narep : Scalar + String to replace a null separator (Column separator) or null strings (Scalar separator). Defaults to an empty string. + + col_narep : Scalar + String that should be used in place of any null strings found in any column. + A ValueError is raised if this is given when separator is a Scalar. + + separate_nulls : SeparatorOnNulls + If YES, then the separator is included for null rows.
+ + Returns + ------- + Column + New column with concatenated results + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_col_narep + cdef const string_scalar* c_separator + + if narep is None: + narep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + cdef const string_scalar* c_narep = ( + narep.c_obj.get() + ) + + if ColumnOrScalar is Column: + if col_narep is None: + col_narep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + c_col_narep = ( + col_narep.c_obj.get() + ) + with nogil: + c_result = move( + cpp_combine.concatenate( + strings_columns.view(), + separator.view(), + dereference(c_narep), + dereference(c_col_narep), + separate_nulls + ) + ) + elif ColumnOrScalar is Scalar: + if col_narep is not None: + raise ValueError( + "col_narep cannot be specified when separator is a Scalar" + ) + c_separator = (separator.c_obj.get()) + with nogil: + c_result = move( + cpp_combine.concatenate( + strings_columns.view(), + dereference(c_separator), + dereference(c_narep), + separate_nulls + ) + ) + else: + raise ValueError("separator must be a Column or a Scalar") + return Column.from_libcudf(move(c_result)) + + +cpdef Column join_strings(Column input, Scalar separator, Scalar narep): + """ + Concatenates all strings in the column into one new string delimited + by the given separator string. + + Parameters + ---------- + input : Column + Strings column whose rows are concatenated + + separator : Scalar + String that is inserted between each string + + narep : Scalar + String to replace any null strings found.
+ + Returns + ------- + Column + New column containing one string + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_separator = ( + separator.c_obj.get() + ) + cdef const string_scalar* c_narep = ( + narep.c_obj.get() + ) + with nogil: + c_result = move( + cpp_combine.join_strings( + input.view(), + dereference(c_separator), + dereference(c_narep), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column join_list_elements( + Column lists_strings_column, + ColumnOrScalar separator, + Scalar separator_narep, + Scalar string_narep, + separator_on_nulls separate_nulls, + output_if_empty_list empty_list_policy, +): + """ + Given a lists column of strings (each row is a list of strings), + concatenates the strings within each row and returns a single strings + column result. + + Parameters + ---------- + lists_strings_column : Column + Column containing lists of strings to concatenate + + separator : Column or Scalar + String(s) that should be inserted between each string from each row. + + separator_narep : Scalar + String that should be used to replace a null separator. When separator is a Scalar, this is the only replacement string passed on to libcudf. + + string_narep : Scalar + String to replace null strings in any non-null list row. + Ignored if separator is a Scalar. + + separate_nulls : SeparatorOnNulls + If YES, then the separator is included for null rows + if `narep` is valid + + empty_list_policy : OutputIfEmptyList + If set to EMPTY_STRING, any input row that is an empty + list will result in an empty string. Otherwise, it will + result in a null.
+ + + Returns + ------- + Column + New strings column with concatenated results + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_separator_narep = ( + separator_narep.c_obj.get() + ) + cdef const string_scalar* c_string_narep = ( + string_narep.c_obj.get() + ) + cdef const string_scalar* c_separator + + if ColumnOrScalar is Column: + with nogil: + c_result = move( + cpp_combine.join_list_elements( + lists_strings_column.view(), + separator.view(), + dereference(c_separator_narep), + dereference(c_string_narep), + separate_nulls, + empty_list_policy, + ) + ) + elif ColumnOrScalar is Scalar: + c_separator = (separator.c_obj.get()) + with nogil: + c_result = move( + cpp_combine.join_list_elements( + lists_strings_column.view(), + dereference(c_separator), + dereference(c_separator_narep), + separate_nulls, + empty_list_policy, + ) + ) + else: + raise ValueError("separator must be a Column or a Scalar") + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_combine.py b/python/pylibcudf/pylibcudf/tests/test_string_combine.py new file mode 100644 index 00000000000..4a7007a0d6b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_combine.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +def test_concatenate_scalar_seperator(): + plc_table = plc.interop.from_arrow( + pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]}) + ) + sep = plc.interop.from_arrow(pa.scalar("-")) + result = plc.strings.combine.concatenate( + plc_table, + sep, + ) + expected = pa.array(["a-a", "-b", "c-"]) + assert_column_eq(result, expected) + + result = plc.strings.combine.concatenate( + plc_table, sep, narep=plc.interop.from_arrow(pa.scalar("!")) + ) + expected = pa.array(["a-a", "!-b", "c-!"]) + assert_column_eq(result, expected) + + with pytest.raises(ValueError): + plc.strings.combine.concatenate( + plc_table, + sep, + narep=plc.interop.from_arrow(pa.scalar("!")), + col_narep=plc.interop.from_arrow(pa.scalar("?")), + ) + + +def test_concatenate_column_seperator(): + plc_table = plc.interop.from_arrow( + pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]}) + ) + sep = plc.interop.from_arrow(pa.array(["-", "?", ","])) + result = plc.strings.combine.concatenate( + plc_table, + sep, + ) + expected = pa.array(["a-a", "?b", "c,"]) + assert_column_eq(result, expected) + + result = plc.strings.combine.concatenate( + plc_table, + plc.interop.from_arrow(pa.array([None, "?", ","])), + narep=plc.interop.from_arrow(pa.scalar("1")), + col_narep=plc.interop.from_arrow(pa.scalar("*")), + ) + expected = pa.array(["a1a", "*?b", "c,*"]) + assert_column_eq(result, expected) + + +def test_join_strings(): + pa_arr = pa.array(list("abc")) + sep = pa.scalar("") + result = plc.strings.combine.join_strings( + plc.interop.from_arrow(pa_arr), + plc.interop.from_arrow(sep), + plc.interop.from_arrow(pa.scalar("")), + ) + expected = pa.array(["abc"]) + assert_column_eq(result, expected) + + +def test_join_list_elements(): + pa_arr = pa.array([["a", "a"], ["b", "b"]]) + sep = pa.scalar("") + result = plc.strings.combine.join_list_elements( + 
plc.interop.from_arrow(pa_arr), + plc.interop.from_arrow(sep), + plc.interop.from_arrow(pa.scalar("")), + plc.interop.from_arrow(pa.scalar("")), + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, + ) + expected = pc.binary_join(pa.array([["a", "a"], ["b", "b"]]), sep) + assert_column_eq(result, expected)