Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate string replace.pxd to pylibcudf #15839

Merged
merged 14 commits into from
Jun 5, 2024
Merged
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar


# Cython declaration of the `cudf::make_string_scalar` factory from
# cudf/scalar/scalar_factories.hpp. The block is `nogil`, so the function may
# be called without holding the GIL; `except +` tells Cython to translate C++
# exceptions into Python exceptions.
cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil:
# Takes a C++ string and returns a unique_ptr to the base `scalar` type.
cdef unique_ptr[scalar] make_string_scalar(const string & _string) except +
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
# the License.
# =============================================================================

set(cython_sources case.pyx find.pyx)
set(cython_sources case.pyx find.pyx replace.pyx)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
CXX
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings ASSOCIATED_TARGETS cudf
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Renamed the module prefix because the strings `replace.pyx` would otherwise clash with the regular `replace.pyx`.

lithomas1 marked this conversation as resolved.
Show resolved Hide resolved
)
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport case, find
from . cimport case, find, replace
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import case, find
from . import case, find, replace
4 changes: 1 addition & 3 deletions python/cudf/cudf/_lib/pylibcudf/strings/find.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,8 @@
from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.pylibcudf.scalar cimport Scalar
from cudf._lib.pylibcudf.strings.types cimport ColumnOrScalar
vyasr marked this conversation as resolved.
Show resolved Hide resolved

ctypedef fused ColumnOrScalar:
Column
Scalar

cpdef Column find(
Column input,
Expand Down
20 changes: 20 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.pylibcudf.scalar cimport Scalar
from cudf._lib.pylibcudf.strings.types cimport ColumnOrScalar


# Declaration of the strings `replace` entry point. `target` and `repl` are
# both typed with the fused `ColumnOrScalar` type, so a given specialization
# takes either two Columns or two Scalars.
# `= *` marks `maxrepl` as optional; its default value is supplied by the
# implementation in replace.pyx.
cpdef Column replace(
Column input,
ColumnOrScalar target,
ColumnOrScalar repl,
size_type maxrepl = *
)
# Declaration of the strings `replace_slice` entry point.
# Every argument after `input` is optional (`= *`); the default values are
# supplied by the implementation in replace.pyx.
cpdef Column replace_slice(
Column input,
Scalar repl = *,
size_type start = *,
size_type stop = *
)
84 changes: 84 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
make_string_scalar as cpp_make_string_scalar,
)
from cudf._lib.pylibcudf.libcudf.strings.replace cimport (
replace as cpp_replace,
replace_slice as cpp_replace_slice,
)
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.pylibcudf.scalar cimport Scalar
from cudf._lib.pylibcudf.strings.types cimport ColumnOrScalar


cpdef Column replace(
    Column input,
    ColumnOrScalar target,
    ColumnOrScalar repl,
    size_type maxrepl = -1
):
    """Replace ``target`` with ``repl`` in the strings of ``input``.

    ``target`` and ``repl`` share the fused type ``ColumnOrScalar``, so they
    must either both be Scalars or both be Columns.

    Parameters
    ----------
    input : Column
        The strings column to operate on.
    target : Column or Scalar
        The string(s) to search for.
    repl : Column or Scalar
        The replacement string(s).
    maxrepl : size_type, default -1
        Forwarded unchanged to libcudf when ``target`` and ``repl`` are
        Scalars. The libcudf overload taking Columns has no such argument,
        so only the default value is accepted in that case.

    Returns
    -------
    Column
        A new column built from the libcudf result.

    Raises
    ------
    NotImplementedError
        If ``maxrepl`` is not -1 while ``target`` and ``repl`` are Columns.
    """
    cdef:
        unique_ptr[column] c_result
        const string_scalar* target_str
        const string_scalar* repl_str

    if ColumnOrScalar is Scalar:
        # The Scalars own generic libcudf scalars; view them as string
        # scalars for the scalar overload of libcudf's replace.
        target_str = <string_scalar *>(target.c_obj.get())
        repl_str = <string_scalar *>(repl.c_obj.get())

        with nogil:
            c_result = move(cpp_replace(
                input.view(),
                target_str[0],
                repl_str[0],
                maxrepl,
            ))
    else:
        # Column case
        # TODO: maxrepl should be supported in the corresponding CUDA/C++ code
        # Fail loudly instead of silently ignoring a caller-supplied maxrepl,
        # since the column overload called below cannot honor it.
        if maxrepl != -1:
            raise NotImplementedError(
                "maxrepl is not supported when target and repl are Columns"
            )
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The libcudf overload of replace that takes columns for input/target/repl does not have a maxrepl argument.

We should probably add maxrepl support to that libcudf overload (eventually). Otherwise pylibcudf ends up with an inconsistent API: replace accepts maxrepl as an argument but has to raise whenever it is passed together with column inputs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea. Can you raise an issue?

In the meantime, I would recommend that we change the default value of the parameter to None, then raise a NotImplementedError in this branch of the code if we find a non-None value, while in the Scalar branch we set it to -1.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

with nogil:
c_result = move(cpp_replace(
input.view(),
target.view(),
repl.view(),
))

return Column.from_libcudf(move(c_result))


cpdef Column replace_slice(
    Column input,
    # TODO: default scalar values
    # https://github.com/rapidsai/cudf/issues/15505
    Scalar repl = None,
    size_type start = 0,
    size_type stop = -1
):
    """Call libcudf's ``replace_slice`` on ``input``.

    Parameters
    ----------
    input : Column
        The strings column to operate on.
    repl : Scalar, optional
        The replacement string scalar. When ``None``, an empty string scalar
        is created and used instead.
    start : size_type, default 0
        Forwarded unchanged to libcudf as the start position.
    stop : size_type, default -1
        Forwarded unchanged to libcudf as the stop position.

    Returns
    -------
    Column
        A new column built from the libcudf result.
    """
    cdef:
        unique_ptr[column] c_output
        const string_scalar* c_repl

    # An omitted replacement falls back to an empty string scalar.
    if repl is None:
        repl = Scalar.from_libcudf(cpp_make_string_scalar("".encode()))

    # View the generic scalar owned by ``repl`` as a string scalar.
    c_repl = <string_scalar*>(repl.c_obj.get())

    with nogil:
        c_output = move(
            cpp_replace_slice(input.view(), c_repl[0], start, stop)
        )

    return Column.from_libcudf(move(c_output))
8 changes: 8 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/types.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.scalar cimport Scalar

# Fused type for parameters that may be either a Column or a Scalar.
# Cython generates one specialization of a function per member type.
ctypedef fused ColumnOrScalar:
Column
Scalar
98 changes: 25 additions & 73 deletions python/cudf/cudf/_lib/strings/replace.pyx
Original file line number Diff line number Diff line change
@@ -1,22 +1,15 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libc.stdint cimport int32_t
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from cudf._lib.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
from cudf._lib.pylibcudf.libcudf.strings.replace cimport (
replace as cpp_replace,
replace_slice as cpp_replace_slice,
)
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.scalar cimport DeviceScalar

import cudf._lib.pylibcudf as plc


@acquire_spill_lock()
def slice_replace(Column source_strings,
Expand All @@ -31,22 +24,12 @@ def slice_replace(Column source_strings,

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_str = <const string_scalar*>(
repl.get_raw_ptr()
)

with nogil:
c_result = move(cpp_replace_slice(
source_view,
scalar_str[0],
start,
stop
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc.strings.replace.replace_slice(
source_strings.to_pylibcudf(mode="read"),
repl.c_value,
start,
stop
))


@acquire_spill_lock()
Expand All @@ -60,22 +43,12 @@ def insert(Column source_strings,

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_str = <const string_scalar*>(
repl.get_raw_ptr()
)

with nogil:
c_result = move(cpp_replace_slice(
source_view,
scalar_str[0],
start,
start
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc.strings.replace.replace_slice(
source_strings.to_pylibcudf(mode="read"),
repl.c_value,
start,
start,
))


@acquire_spill_lock()
Expand All @@ -91,25 +64,12 @@ def replace(Column source_strings,
cdef DeviceScalar target = py_target.device_value
cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_target = <const string_scalar*>(
target.get_raw_ptr()
)
cdef const string_scalar* scalar_repl = <const string_scalar*>(
repl.get_raw_ptr()
)

with nogil:
c_result = move(cpp_replace(
source_view,
scalar_target[0],
scalar_repl[0],
maxrepl
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc.strings.replace.replace(
source_strings.to_pylibcudf(mode="read"),
target.c_value,
repl.c_value,
maxrepl
))


@acquire_spill_lock()
Expand All @@ -120,16 +80,8 @@ def replace_multi(Column source_strings,
Returns a Column after replacing occurrences of
patterns `target_strings` with `repl_strings` in `source_strings`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef column_view target_view = target_strings.view()
cdef column_view repl_view = repl_strings.view()

with nogil:
c_result = move(cpp_replace(
source_view,
target_view,
repl_view
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc.strings.replace.replace(
source_strings.to_pylibcudf(mode="read"),
target_strings.to_pylibcudf(mode="read"),
repl_strings.to_pylibcudf(mode="read"),
))
Loading
Loading