Skip to content

Commit

Permalink
Add strings.combine APIs to pylibcudf (#16790)
Browse files Browse the repository at this point in the history
Contributes to #15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Matthew Murray (https://github.com/Matt711)

URL: #16790
mroeschke authored Oct 17, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent 5f863a5 commit 3683e46
Showing 12 changed files with 397 additions and 111 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
combine
=======

.. automodule:: pylibcudf.strings.combine
    :members:
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@ strings

capitalize
char_types
combine
contains
extract
find
130 changes: 29 additions & 101 deletions python/cudf/cudf/_lib/strings/combine.pyx
Original file line number Diff line number Diff line change
@@ -2,24 +2,11 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from cudf._lib.column cimport Column

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.combine cimport (
concatenate as cpp_concatenate,
join_list_elements as cpp_join_list_elements,
join_strings as cpp_join_strings,
output_if_empty_list,
separator_on_nulls,
)
from pylibcudf.libcudf.table.table_view cimport table_view
import pylibcudf as plc

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.utils cimport table_view_from_columns
import cudf


@acquire_spill_lock()
@@ -31,26 +18,12 @@ def concatenate(list source_strings,
with the specified `sep` between each column and
`na`/`None` values are replaced by `na_rep`
"""
cdef DeviceScalar separator = sep.device_value
cdef DeviceScalar narep = na_rep.device_value

cdef unique_ptr[column] c_result
cdef table_view source_view = table_view_from_columns(source_strings)

cdef const string_scalar* scalar_separator = \
<const string_scalar*>(separator.get_raw_ptr())
cdef const string_scalar* scalar_narep = <const string_scalar*>(
narep.get_raw_ptr()
plc_column = plc.strings.combine.concatenate(
plc.Table([col.to_pylibcudf(mode="read") for col in source_strings]),
sep.device_value.c_value,
na_rep.device_value.c_value,
)

with nogil:
c_result = move(cpp_concatenate(
source_view,
scalar_separator[0],
scalar_narep[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
@@ -62,27 +35,12 @@ def join(Column source_strings,
with the specified `sep` between each column and
`na`/`None` values are replaced by `na_rep`
"""

cdef DeviceScalar separator = sep.device_value
cdef DeviceScalar narep = na_rep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_separator = \
<const string_scalar*>(separator.get_raw_ptr())
cdef const string_scalar* scalar_narep = <const string_scalar*>(
narep.get_raw_ptr()
plc_column = plc.strings.combine.join_strings(
source_strings.to_pylibcudf(mode="read"),
sep.device_value.c_value,
na_rep.device_value.c_value,
)

with nogil:
c_result = move(cpp_join_strings(
source_view,
scalar_separator[0],
scalar_narep[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
@@ -96,29 +54,15 @@ def join_lists_with_scalar(
between each string in lists and `<NA>`/`None` values
are replaced by `py_narep`
"""

cdef DeviceScalar separator = py_separator.device_value
cdef DeviceScalar narep = py_narep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_separator = \
<const string_scalar*>(separator.get_raw_ptr())
cdef const string_scalar* scalar_narep = <const string_scalar*>(
narep.get_raw_ptr()
plc_column = plc.strings.combine.join_list_elements(
source_strings.to_pylibcudf(mode="read"),
py_separator.device_value.c_value,
py_narep.device_value.c_value,
cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value,
plc.strings.combine.SeparatorOnNulls.YES,
plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
)

with nogil:
c_result = move(cpp_join_list_elements(
source_view,
scalar_separator[0],
scalar_narep[0],
separator_on_nulls.YES,
output_if_empty_list.NULL_ELEMENT
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
@@ -135,28 +79,12 @@ def join_lists_with_column(
`<NA>`/`None` values in `separator_strings` are replaced
by `py_separator_narep`
"""

cdef DeviceScalar source_narep = py_source_narep.device_value
cdef DeviceScalar separator_narep = py_separator_narep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef column_view separator_view = separator_strings.view()

cdef const string_scalar* scalar_source_narep = \
<const string_scalar*>(source_narep.get_raw_ptr())
cdef const string_scalar* scalar_separator_narep = <const string_scalar*>(
separator_narep.get_raw_ptr()
plc_column = plc.strings.combine.join_list_elements(
source_strings.to_pylibcudf(mode="read"),
separator_strings.to_pylibcudf(mode="read"),
py_separator_narep.device_value.c_value,
py_source_narep.device_value.c_value,
plc.strings.combine.SeparatorOnNulls.YES,
plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
)

with nogil:
c_result = move(cpp_join_list_elements(
source_view,
separator_view,
scalar_separator_narep[0],
scalar_source_narep[0],
separator_on_nulls.YES,
output_if_empty_list.NULL_ELEMENT
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx)
set(cython_sources char_types.pyx combine.pyx regex_flags.pyx side_type.pyx translate.pyx)

set(linked_libraries cudf::cudf)

27 changes: 18 additions & 9 deletions python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp cimport int
from libcpp.memory cimport unique_ptr
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
@@ -9,21 +10,29 @@ from pylibcudf.libcudf.table.table_view cimport table_view

cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil:

ctypedef enum separator_on_nulls:
YES 'cudf::strings::separator_on_nulls::YES'
NO 'cudf::strings::separator_on_nulls::NO'
cpdef enum class separator_on_nulls(int):
YES
NO

ctypedef enum output_if_empty_list:
EMPTY_STRING 'cudf::strings::output_if_empty_list::EMPTY_STRING'
NULL_ELEMENT 'cudf::strings::output_if_empty_list::NULL_ELEMENT'
cpdef enum class output_if_empty_list(int):
EMPTY_STRING
NULL_ELEMENT

cdef unique_ptr[column] concatenate(
table_view source_strings,
table_view strings_columns,
string_scalar separator,
string_scalar narep) except +
string_scalar narep,
separator_on_nulls separate_nulls) except +

cdef unique_ptr[column] concatenate(
table_view strings_columns,
column_view separators,
string_scalar separator_narep,
string_scalar col_narep,
separator_on_nulls separate_nulls) except +

cdef unique_ptr[column] join_strings(
column_view source_strings,
column_view input,
string_scalar separator,
string_scalar narep) except +

Empty file.
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -18,6 +18,7 @@ set(cython_sources
case.pyx
char_types.pyx
contains.pyx
combine.pyx
extract.pyx
find.pyx
find_multiple.pyx
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@ from . cimport (
capitalize,
case,
char_types,
combine,
contains,
convert,
extract,
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@
capitalize,
case,
char_types,
combine,
contains,
convert,
extract,
33 changes: 33 additions & 0 deletions python/pylibcudf/pylibcudf/strings/combine.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.strings.combine cimport (
output_if_empty_list,
separator_on_nulls,
)
from pylibcudf.scalar cimport Scalar
from pylibcudf.table cimport Table

# Fused type that lets a ``separator`` argument be either a Column of
# separators (one per row) or a single Scalar separator.
ctypedef fused ColumnOrScalar:
    Column
    Scalar

# ``narep``, ``col_narep`` and ``separate_nulls`` are optional here (``=*``);
# their default values are supplied by the implementation in combine.pyx.
cpdef Column concatenate(
    Table strings_columns,
    ColumnOrScalar separator,
    Scalar narep=*,
    Scalar col_narep=*,
    separator_on_nulls separate_nulls=*,
)

# Joins every string of ``input`` into a one-row strings Column.
cpdef Column join_strings(Column input, Scalar separator, Scalar narep)


# NOTE(review): the first parameter is declared as ``source_strings`` here,
# while the combine.pyx implementation names it ``lists_strings_column`` --
# consider aligning the two names.
cpdef Column join_list_elements(
    Column source_strings,
    ColumnOrScalar separator,
    Scalar separator_narep,
    Scalar string_narep,
    separator_on_nulls separate_nulls,
    output_if_empty_list empty_list_policy,
)
223 changes: 223 additions & 0 deletions python/pylibcudf/pylibcudf/strings/combine.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.scalar.scalar_factories cimport (
make_string_scalar as cpp_make_string_scalar,
)
from pylibcudf.libcudf.strings cimport combine as cpp_combine
from pylibcudf.scalar cimport Scalar
from pylibcudf.table cimport Table

from cython.operator import dereference
from pylibcudf.libcudf.strings.combine import \
output_if_empty_list as OutputIfEmptyList # no-cython-lint
from pylibcudf.libcudf.strings.combine import \
separator_on_nulls as SeparatorOnNulls # no-cython-lint


cpdef Column concatenate(
    Table strings_columns,
    ColumnOrScalar separator,
    Scalar narep=None,
    Scalar col_narep=None,
    separator_on_nulls separate_nulls=separator_on_nulls.YES,
):
    """
    Concatenate all columns in the table horizontally into one new strings
    column, with the values of each row delimited by a separator.

    Parameters
    ----------
    strings_columns : Table
        Strings for this operation
    separator : Column or Scalar
        Separator(s) for a given row. A Scalar is used for every row; a
        Column supplies one separator per row.
    narep : Scalar, optional
        The meaning depends on the type of ``separator``:

        - Scalar separator: string used in place of any null string
          found in any column.
        - Column separator: string used in place of a null separator
          for a given row.

        Defaults to an empty string.
    col_narep : Scalar, optional
        String that should be used in place of any null strings found in
        any column. Only accepted when ``separator`` is a Column, where it
        defaults to an empty string.
    separate_nulls : SeparatorOnNulls
        If YES, then the separator is included for null rows.

    Returns
    -------
    Column
        New column with concatenated results

    Raises
    ------
    ValueError
        If ``col_narep`` is specified while ``separator`` is a Scalar.
    """
    cdef unique_ptr[column] c_result
    cdef const string_scalar* c_narep
    cdef const string_scalar* c_col_narep
    cdef const string_scalar* c_separator

    if narep is None:
        # A valid empty string, so null entries contribute nothing to the
        # output row by default.
        narep = Scalar.from_libcudf(
            cpp_make_string_scalar("".encode())
        )
    c_narep = <const string_scalar*>(narep.c_obj.get())

    if ColumnOrScalar is Column:
        if col_narep is None:
            col_narep = Scalar.from_libcudf(
                cpp_make_string_scalar("".encode())
            )
        c_col_narep = <const string_scalar*>(col_narep.c_obj.get())
        with nogil:
            # Column overload: ``narep`` replaces null separators and
            # ``col_narep`` replaces null strings.
            c_result = move(
                cpp_combine.concatenate(
                    strings_columns.view(),
                    separator.view(),
                    dereference(c_narep),
                    dereference(c_col_narep),
                    separate_nulls
                )
            )
    elif ColumnOrScalar is Scalar:
        # The scalar overload has a single replacement value (``narep``),
        # so a second one cannot be honoured.
        if col_narep is not None:
            raise ValueError(
                "col_narep cannot be specified when separator is a Scalar"
            )
        c_separator = <const string_scalar*>(separator.c_obj.get())
        with nogil:
            c_result = move(
                cpp_combine.concatenate(
                    strings_columns.view(),
                    dereference(c_separator),
                    dereference(c_narep),
                    separate_nulls
                )
            )
    else:
        # Defensive guard; presumably fused-type dispatch already rejects
        # any other separator type before this point.
        raise ValueError("separator must be a Column or a Scalar")
    return Column.from_libcudf(move(c_result))


cpdef Column join_strings(Column input, Scalar separator, Scalar narep):
    """
    Concatenate all strings in the column into one new string delimited
    by an optional separator string.

    Parameters
    ----------
    input : Column
        Strings column whose rows are joined together
    separator : Scalar
        String placed between the joined strings
    narep : Scalar
        String to replace any null strings found.

    Returns
    -------
    Column
        New column containing one string
    """
    cdef unique_ptr[column] c_result
    cdef const string_scalar* c_separator
    cdef const string_scalar* c_narep

    # libcudf takes string_scalar references, so reinterpret the generic
    # scalar pointers owned by the pylibcudf Scalars.
    c_separator = <const string_scalar*>(separator.c_obj.get())
    c_narep = <const string_scalar*>(narep.c_obj.get())

    with nogil:
        c_result = move(
            cpp_combine.join_strings(
                input.view(),
                dereference(c_separator),
                dereference(c_narep),
            )
        )

    return Column.from_libcudf(move(c_result))


cpdef Column join_list_elements(
    Column lists_strings_column,
    ColumnOrScalar separator,
    Scalar separator_narep,
    Scalar string_narep,
    separator_on_nulls separate_nulls,
    output_if_empty_list empty_list_policy,
):
    """
    Given a lists column of strings (each row is a list of strings),
    concatenates the strings within each row and returns a single strings
    column result.

    Parameters
    ----------
    lists_strings_column : Column
        Column containing lists of strings to concatenate
    separator : Column or Scalar
        String(s) that should be inserted between each string from each
        row. A Scalar is used for every row; a Column supplies one
        separator per row.
    separator_narep : Scalar
        When ``separator`` is a Column: string that should be used to
        replace a null separator.
        When ``separator`` is a Scalar: this is the only replacement value
        forwarded to libcudf (as the ``narep`` argument of the
        scalar-separator overload), so it is the value used for null
        replacement in that mode.
    string_narep : Scalar
        String to replace null strings in any non-null list row.
        Only used when ``separator`` is a Column; ignored if separator is
        a Scalar.
    separate_nulls : SeparatorOnNulls
        If YES, then the separator is included for null rows
        if the string used to replace nulls is valid
    empty_list_policy : OutputIfEmptyList
        If set to EMPTY_STRING, any input row that is an empty
        list will result in an empty string. Otherwise, it will
        result in a null.

    Returns
    -------
    Column
        New strings column with concatenated results
    """
    cdef unique_ptr[column] c_result
    cdef const string_scalar* c_separator
    cdef const string_scalar* c_separator_narep
    cdef const string_scalar* c_string_narep

    c_separator_narep = <const string_scalar*>(separator_narep.c_obj.get())
    c_string_narep = <const string_scalar*>(string_narep.c_obj.get())

    if ColumnOrScalar is Column:
        with nogil:
            c_result = move(
                cpp_combine.join_list_elements(
                    lists_strings_column.view(),
                    separator.view(),
                    dereference(c_separator_narep),
                    dereference(c_string_narep),
                    separate_nulls,
                    empty_list_policy,
                )
            )
    elif ColumnOrScalar is Scalar:
        c_separator = <const string_scalar*>(separator.c_obj.get())
        with nogil:
            # The scalar overload takes one replacement string only;
            # ``separator_narep`` is passed in that slot and
            # ``string_narep`` is intentionally not forwarded.
            c_result = move(
                cpp_combine.join_list_elements(
                    lists_strings_column.view(),
                    dereference(c_separator),
                    dereference(c_separator_narep),
                    separate_nulls,
                    empty_list_policy,
                )
            )
    else:
        # Defensive guard; presumably fused-type dispatch already rejects
        # any other separator type before this point.
        raise ValueError("separator must be a Column or a Scalar")
    return Column.from_libcudf(move(c_result))
83 changes: 83 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_string_combine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pyarrow.compute as pc
import pylibcudf as plc
import pytest
from utils import assert_column_eq


def test_concatenate_scalar_seperator():
    """Scalar separator: default narep, explicit narep, rejected col_narep."""
    concatenate = plc.strings.combine.concatenate
    table = plc.interop.from_arrow(
        pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]})
    )
    dash = plc.interop.from_arrow(pa.scalar("-"))

    # Default narep: nulls contribute nothing but the separator is kept.
    got = concatenate(table, dash)
    assert_column_eq(got, pa.array(["a-a", "-b", "c-"]))

    # Explicit narep: nulls are replaced by "!".
    got = concatenate(
        table, dash, narep=plc.interop.from_arrow(pa.scalar("!"))
    )
    assert_column_eq(got, pa.array(["a-a", "!-b", "c-!"]))

    # col_narep is only meaningful with a Column separator.
    with pytest.raises(ValueError):
        concatenate(
            table,
            dash,
            narep=plc.interop.from_arrow(pa.scalar("!")),
            col_narep=plc.interop.from_arrow(pa.scalar("?")),
        )


def test_concatenate_column_seperator():
    """Column separator: per-row separators, then null replacements."""
    concatenate = plc.strings.combine.concatenate
    table = plc.interop.from_arrow(
        pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]})
    )

    # One separator per row, default replacements for nulls.
    row_separators = plc.interop.from_arrow(pa.array(["-", "?", ","]))
    got = concatenate(table, row_separators)
    assert_column_eq(got, pa.array(["a-a", "?b", "c,"]))

    # A null separator becomes "1"; null strings become "*".
    separators_with_null = plc.interop.from_arrow(pa.array([None, "?", ","]))
    got = concatenate(
        table,
        separators_with_null,
        narep=plc.interop.from_arrow(pa.scalar("1")),
        col_narep=plc.interop.from_arrow(pa.scalar("*")),
    )
    assert_column_eq(got, pa.array(["a1a", "*?b", "c,*"]))


def test_join_strings():
    """Joining ["a", "b", "c"] with an empty separator yields ["abc"]."""
    letters = plc.interop.from_arrow(pa.array(list("abc")))
    empty_separator = plc.interop.from_arrow(pa.scalar(""))
    empty_narep = plc.interop.from_arrow(pa.scalar(""))

    got = plc.strings.combine.join_strings(
        letters, empty_separator, empty_narep
    )

    assert_column_eq(got, pa.array(["abc"]))


def test_join_list_elements():
    """Joining list rows with a scalar separator matches pyarrow's binary_join."""
    rows = [["a", "a"], ["b", "b"]]
    arrow_lists = pa.array(rows)
    arrow_sep = pa.scalar("")
    combine = plc.strings.combine

    got = combine.join_list_elements(
        plc.interop.from_arrow(arrow_lists),
        plc.interop.from_arrow(arrow_sep),
        plc.interop.from_arrow(pa.scalar("")),
        plc.interop.from_arrow(pa.scalar("")),
        combine.SeparatorOnNulls.YES,
        combine.OutputIfEmptyList.NULL_ELEMENT,
    )

    # pyarrow serves as the reference implementation.
    want = pc.binary_join(pa.array(rows), arrow_sep)
    assert_column_eq(got, want)

0 comments on commit 3683e46

Please sign in to comment.