Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate string replace.pxd to pylibcudf #15839

Merged
merged 14 commits into from
Jun 5, 2024
Merged
8 changes: 7 additions & 1 deletion docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ This page provides API documentation for pylibcudf.

.. toctree::
:maxdepth: 1
:caption: API Documentation
:caption: Top-level modules

aggregation
binaryop
Expand All @@ -32,3 +32,9 @@ This page provides API documentation for pylibcudf.
table
types
unary

.. toctree::
:maxdepth: 2
:caption: Submodules
lithomas1 marked this conversation as resolved.
Show resolved Hide resolved

strings/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
strings
=======

.. toctree::
:maxdepth: 1

replace
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
replace
=======

.. automodule:: cudf._lib.pylibcudf.strings.replace
:members:
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
# the License.
# =============================================================================

set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx)
set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx replace.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
CXX
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf
)
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport capitalize, case, char_types, find
from . cimport capitalize, case, char_types, find, replace
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import capitalize, case, char_types, find
from . import capitalize, case, char_types, find, replace
25 changes: 25 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.pylibcudf.scalar cimport Scalar


# Declarations of the string-replacement functions whose bodies live in
# replace.pyx. An argument written as ``= *`` is optional at this level; the
# concrete default value is given by the matching definition in the .pyx file.

# Scalar target/replacement variant; ``maxrepl`` is optional.
cpdef Column replace(
Column input,
Scalar target,
Scalar repl,
size_type maxrepl = *
)
# Column target/replacement variant; ``maxrepl`` is optional.
cpdef Column replace_multiple(
Column input,
Column target,
Column repl,
size_type maxrepl = *
)
# Positional (start/stop) variant; every argument except ``input`` is optional.
cpdef Column replace_slice(
Column input,
Scalar repl = *,
size_type start = *,
size_type stop = *
)
162 changes: 162 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
make_string_scalar as cpp_make_string_scalar,
)
from cudf._lib.pylibcudf.libcudf.strings.replace cimport (
replace as cpp_replace,
replace_multiple as cpp_replace_multiple,
replace_slice as cpp_replace_slice,
)
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.pylibcudf.scalar cimport Scalar


cpdef Column replace(
    Column input,
    Scalar target,
    Scalar repl,
    size_type maxrepl = -1
):
    """Replace occurrences of ``target`` in every string of ``input``.

    Null string entries will return null output string entries.

    For details, see :cpp:func:`replace`.

    Parameters
    ----------
    input : Column
        The strings column to operate on.
    target : Scalar
        String to look for within each row.
    repl : Scalar
        String substituted for each match of ``target``.
    maxrepl : size_type, default -1
        Maximum number of replacements per row when ``target`` appears
        more than once. The default of -1 replaces every occurrence.

    Returns
    -------
    pylibcudf.Column
        New strings column with ``target`` replaced.
    """
    # View the scalars owned by ``target`` and ``repl`` through
    # string_scalar pointers; they are dereferenced in the call below.
    cdef const string_scalar* c_target = <const string_scalar*>(
        target.c_obj.get()
    )
    cdef const string_scalar* c_repl = <const string_scalar*>(
        repl.c_obj.get()
    )
    cdef unique_ptr[column] c_result

    with nogil:
        c_result = move(
            cpp_replace(input.view(), c_target[0], c_repl[0], maxrepl)
        )

    return Column.from_libcudf(move(c_result))


cpdef Column replace_multiple(
    Column input,
    Column target,
    Column repl,
    size_type maxrepl = -1
):
    """Replace each string found in ``target`` with the matching ``repl`` entry.

    Every row of ``input`` is searched for the strings in ``target``; a
    match is replaced by the string at the corresponding index of ``repl``.

    Null string entries will return null output string entries.

    For details, see :cpp:func:`replace_multiple`.

    Parameters
    ----------
    input : Column
        The input strings
    target : Column
        Column containing strings to search for in the input column.
    repl : Column
        Column containing strings to replace target with.
        Each target, when found, will be replaced by the value at the
        corresponding index in the repl Column.

        Must be of the same length as target. (The libcudf documentation
        additionally allows a single-row ``repl`` that is applied to every
        target -- confirm against the libcudf version in use.)
    maxrepl : size_type, default -1
        Currently ignored. The value is not forwarded to libcudf, whose
        ``replace_multiple`` is called here without a replacement limit.
        The parameter is retained only so the signature stays compatible
        with the declaration in ``replace.pxd``.

    Returns
    -------
    pylibcudf.Column
        New string column with target replaced.
    """
    cdef unique_ptr[column] c_result

    # NOTE: ``maxrepl`` is intentionally not passed on; see the docstring.
    with nogil:
        c_result = move(cpp_replace_multiple(
            input.view(),
            target.view(),
            repl.view(),
        ))

    return Column.from_libcudf(move(c_result))


cpdef Column replace_slice(
    Column input,
    # TODO: default scalar values
    # https://github.com/rapidsai/cudf/issues/15505
    Scalar repl = None,
    size_type start = 0,
    size_type stop = -1
):
    """Replace the ``[start, stop)`` character range of each string with ``repl``.

    Null string entries will return null output string entries.
    Passing the same value for ``start`` and ``stop`` inserts ``repl`` at
    that position, and passing -1 for both appends ``repl`` to each string.

    For details, see :cpp:func:`replace_slice`.

    Parameters
    ----------
    input : Column
        The input strings
    repl : Scalar, default ""
        String scalar placed into the selected character range.
        ``None`` is treated as the empty string.
    start : size_type, default 0
        Start position where repl will be added.
    stop : size_type, default -1
        End position (exclusive) to use for replacement.

    Returns
    -------
    pylibcudf.Column
        New string column
    """
    cdef unique_ptr[column] c_result
    cdef const string_scalar* c_repl

    if repl is None:
        # No replacement supplied: build an empty string scalar to use.
        repl = Scalar.from_libcudf(cpp_make_string_scalar("".encode()))

    # Taken after the None handling so it refers to the scalar actually used.
    c_repl = <const string_scalar*>(repl.c_obj.get())

    with nogil:
        c_result = move(
            cpp_replace_slice(input.view(), c_repl[0], start, stop)
        )

    return Column.from_libcudf(move(c_result))
99 changes: 25 additions & 74 deletions python/cudf/cudf/_lib/strings/replace.pyx
Original file line number Diff line number Diff line change
@@ -1,23 +1,15 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libc.stdint cimport int32_t
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from cudf._lib.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
from cudf._lib.pylibcudf.libcudf.strings.replace cimport (
replace as cpp_replace,
replace_multiple as cpp_replace_multiple,
replace_slice as cpp_replace_slice,
)
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.scalar cimport DeviceScalar

import cudf._lib.pylibcudf as plc


@acquire_spill_lock()
def slice_replace(Column source_strings,
Expand All @@ -32,22 +24,12 @@ def slice_replace(Column source_strings,

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_str = <const string_scalar*>(
repl.get_raw_ptr()
)

with nogil:
c_result = move(cpp_replace_slice(
source_view,
scalar_str[0],
start,
stop
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc.strings.replace.replace_slice(
source_strings.to_pylibcudf(mode="read"),
repl.c_value,
start,
stop
))


@acquire_spill_lock()
Expand All @@ -61,22 +43,12 @@ def insert(Column source_strings,

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_str = <const string_scalar*>(
repl.get_raw_ptr()
)

with nogil:
c_result = move(cpp_replace_slice(
source_view,
scalar_str[0],
start,
start
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc.strings.replace.replace_slice(
source_strings.to_pylibcudf(mode="read"),
repl.c_value,
start,
start,
))


@acquire_spill_lock()
Expand All @@ -92,25 +64,12 @@ def replace(Column source_strings,
cdef DeviceScalar target = py_target.device_value
cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_target = <const string_scalar*>(
target.get_raw_ptr()
)
cdef const string_scalar* scalar_repl = <const string_scalar*>(
repl.get_raw_ptr()
)

with nogil:
c_result = move(cpp_replace(
source_view,
scalar_target[0],
scalar_repl[0],
maxrepl
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc.strings.replace.replace(
source_strings.to_pylibcudf(mode="read"),
target.c_value,
repl.c_value,
maxrepl
))


@acquire_spill_lock()
Expand All @@ -121,16 +80,8 @@ def replace_multi(Column source_strings,
Returns a Column after replacing occurrences of
patterns `target_strings` with `repl_strings` in `source_strings`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef column_view target_view = target_strings.view()
cdef column_view repl_view = repl_strings.view()

with nogil:
c_result = move(cpp_replace_multiple(
source_view,
target_view,
repl_view
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc.strings.replace.replace_multiple(
source_strings.to_pylibcudf(mode="read"),
target_strings.to_pylibcudf(mode="read"),
repl_strings.to_pylibcudf(mode="read"),
))
Loading
Loading