Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add strings.combine APIs to pylibcudf #16790

Merged
merged 15 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
combine
=======

.. automodule:: pylibcudf.strings.combine
:members:
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ strings

capitalize
char_types
combine
contains
extract
find
Expand Down
130 changes: 29 additions & 101 deletions python/cudf/cudf/_lib/strings/combine.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,11 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from cudf._lib.column cimport Column

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.combine cimport (
concatenate as cpp_concatenate,
join_list_elements as cpp_join_list_elements,
join_strings as cpp_join_strings,
output_if_empty_list,
separator_on_nulls,
)
from pylibcudf.libcudf.table.table_view cimport table_view
import pylibcudf as plc

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.utils cimport table_view_from_columns
import cudf


@acquire_spill_lock()
Expand All @@ -31,26 +18,12 @@ def concatenate(list source_strings,
with the specified `sep` between each column and
`na`/`None` values are replaced by `na_rep`
"""
cdef DeviceScalar separator = sep.device_value
cdef DeviceScalar narep = na_rep.device_value

cdef unique_ptr[column] c_result
cdef table_view source_view = table_view_from_columns(source_strings)

cdef const string_scalar* scalar_separator = \
<const string_scalar*>(separator.get_raw_ptr())
cdef const string_scalar* scalar_narep = <const string_scalar*>(
narep.get_raw_ptr()
plc_column = plc.strings.combine.concatenate(
plc.Table([col.to_pylibcudf(mode="read") for col in source_strings]),
sep.device_value.c_value,
na_rep.device_value.c_value,
)

with nogil:
c_result = move(cpp_concatenate(
source_view,
scalar_separator[0],
scalar_narep[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -62,27 +35,12 @@ def join(Column source_strings,
with the specified `sep` between each column and
`na`/`None` values are replaced by `na_rep`
"""

cdef DeviceScalar separator = sep.device_value
cdef DeviceScalar narep = na_rep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_separator = \
<const string_scalar*>(separator.get_raw_ptr())
cdef const string_scalar* scalar_narep = <const string_scalar*>(
narep.get_raw_ptr()
plc_column = plc.strings.combine.join_strings(
source_strings.to_pylibcudf(mode="read"),
sep.device_value.c_value,
na_rep.device_value.c_value,
)

with nogil:
c_result = move(cpp_join_strings(
source_view,
scalar_separator[0],
scalar_narep[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -96,29 +54,15 @@ def join_lists_with_scalar(
between each string in lists and `<NA>`/`None` values
are replaced by `py_narep`
"""

cdef DeviceScalar separator = py_separator.device_value
cdef DeviceScalar narep = py_narep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_separator = \
<const string_scalar*>(separator.get_raw_ptr())
cdef const string_scalar* scalar_narep = <const string_scalar*>(
narep.get_raw_ptr()
plc_column = plc.strings.combine.join_list_elements(
source_strings.to_pylibcudf(mode="read"),
py_separator.device_value.c_value,
py_narep.device_value.c_value,
cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value,
plc.strings.combine.SeparatorOnNulls.YES,
plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
)

with nogil:
c_result = move(cpp_join_list_elements(
source_view,
scalar_separator[0],
scalar_narep[0],
separator_on_nulls.YES,
output_if_empty_list.NULL_ELEMENT
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -135,28 +79,12 @@ def join_lists_with_column(
`<NA>`/`None` values in `separator_strings` are replaced
by `py_separator_narep`
"""

cdef DeviceScalar source_narep = py_source_narep.device_value
cdef DeviceScalar separator_narep = py_separator_narep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef column_view separator_view = separator_strings.view()

cdef const string_scalar* scalar_source_narep = \
<const string_scalar*>(source_narep.get_raw_ptr())
cdef const string_scalar* scalar_separator_narep = <const string_scalar*>(
separator_narep.get_raw_ptr()
plc_column = plc.strings.combine.join_list_elements(
source_strings.to_pylibcudf(mode="read"),
separator_strings.to_pylibcudf(mode="read"),
py_separator_narep.device_value.c_value,
py_source_narep.device_value.c_value,
wence- marked this conversation as resolved.
Show resolved Hide resolved
plc.strings.combine.SeparatorOnNulls.YES,
plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
)

with nogil:
c_result = move(cpp_join_list_elements(
source_view,
separator_view,
scalar_separator_narep[0],
scalar_source_narep[0],
separator_on_nulls.YES,
output_if_empty_list.NULL_ELEMENT
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx)
set(cython_sources char_types.pyx combine.pyx regex_flags.pyx side_type.pyx translate.pyx)

set(linked_libraries cudf::cudf)

Expand Down
27 changes: 18 additions & 9 deletions python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp cimport int
from libcpp.memory cimport unique_ptr
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
Expand All @@ -9,21 +10,29 @@ from pylibcudf.libcudf.table.table_view cimport table_view

cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil:

ctypedef enum separator_on_nulls:
YES 'cudf::strings::separator_on_nulls::YES'
NO 'cudf::strings::separator_on_nulls::NO'
cpdef enum class separator_on_nulls(int):
YES
NO

ctypedef enum output_if_empty_list:
EMPTY_STRING 'cudf::strings::output_if_empty_list::EMPTY_STRING'
NULL_ELEMENT 'cudf::strings::output_if_empty_list::NULL_ELEMENT'
cpdef enum class output_if_empty_list(int):
EMPTY_STRING
NULL_ELEMENT

cdef unique_ptr[column] concatenate(
table_view source_strings,
table_view strings_columns,
string_scalar separator,
string_scalar narep) except +
string_scalar narep,
separator_on_nulls separate_nulls) except +

cdef unique_ptr[column] concatenate(
table_view strings_columns,
column_view separators,
string_scalar separator_narep,
string_scalar col_narep,
separator_on_nulls separate_nulls) except +

cdef unique_ptr[column] join_strings(
column_view source_strings,
column_view input,
string_scalar separator,
string_scalar narep) except +

Expand Down
Empty file.
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ set(cython_sources
case.pyx
char_types.pyx
contains.pyx
combine.pyx
extract.pyx
find.pyx
find_multiple.pyx
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ from . cimport (
capitalize,
case,
char_types,
combine,
contains,
convert,
extract,
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
capitalize,
case,
char_types,
combine,
contains,
convert,
extract,
Expand Down
33 changes: 33 additions & 0 deletions python/pylibcudf/pylibcudf/strings/combine.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.strings.combine cimport (
output_if_empty_list,
separator_on_nulls,
)
from pylibcudf.scalar cimport Scalar
from pylibcudf.table cimport Table

ctypedef fused ColumnOrScalar:
Column
Scalar

cpdef Column concatenate(
Table strings_columns,
ColumnOrScalar separator,
Scalar narep=*,
Scalar col_narep=*,
separator_on_nulls separate_nulls=*,
)

cpdef Column join_strings(Column input, Scalar separator, Scalar narep)


cpdef Column join_list_elements(
Column source_strings,
ColumnOrScalar separator,
Scalar separator_narep,
Scalar string_narep,
separator_on_nulls separate_nulls,
output_if_empty_list empty_list_policy,
)
Loading
Loading