From d49ed89821bb44ee5751c127009b3e1b1aaaa838 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 23 May 2024 09:10:45 -0700 Subject: [PATCH 1/9] Migrate string replace.pxd to pylibcudf --- .../libcudf/scalar/scalar_factories.pxd | 10 ++ .../_lib/pylibcudf/strings/CMakeLists.txt | 4 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 2 +- .../cudf/_lib/pylibcudf/strings/__init__.py | 2 +- .../cudf/cudf/_lib/pylibcudf/strings/find.pxd | 4 +- .../cudf/_lib/pylibcudf/strings/replace.pxd | 20 ++++ .../cudf/_lib/pylibcudf/strings/replace.pyx | 84 ++++++++++++++++ .../cudf/_lib/pylibcudf/strings/types.pxd | 8 ++ python/cudf/cudf/_lib/strings/replace.pyx | 98 +++++-------------- 9 files changed, 152 insertions(+), 80 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/types.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd new file mode 100644 index 00000000000..5c4e5bf346f --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar + + +cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: + cdef unique_ptr[scalar] make_string_scalar(const string & _string) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index c42b57ece63..aceadbf574e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -12,10 +12,10 @@ # the License. # ============================================================================= -set(cython_sources case.pyx find.pyx) +set(cython_sources case.pyx find.pyx replace.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings ASSOCIATED_TARGETS cudf ) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index 33e2d56c087..eafacbc0393 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport case, find +from . cimport case, find, replace diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index 9220f6bd045..711ae759839 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import case, find +from . import case, find, replace diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd index bb43069f190..69ecb3e1f67 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd @@ -3,10 +3,8 @@ from cudf._lib.pylibcudf.column cimport Column from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.pylibcudf.scalar cimport Scalar +from cudf._lib.pylibcudf.strings.types cimport ColumnOrScalar -ctypedef fused ColumnOrScalar: - Column - Scalar cpdef Column find( Column input, diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd new file mode 100644 index 00000000000..d6c6da78399 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.scalar cimport Scalar +from cudf._lib.pylibcudf.strings.types cimport ColumnOrScalar + + +cpdef Column replace( + Column input, + ColumnOrScalar target, + ColumnOrScalar repl, + size_type maxrepl = * +) +cpdef Column replace_slice( + Column input, + Scalar repl = *, + size_type start = *, + size_type stop = * +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx new file mode 100644 index 00000000000..aee4ebab1eb --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx @@ -0,0 +1,84 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar +from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from cudf._lib.pylibcudf.libcudf.strings.replace cimport ( + replace as cpp_replace, + replace_slice as cpp_replace_slice, +) +from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.scalar cimport Scalar +from cudf._lib.pylibcudf.strings.types cimport ColumnOrScalar + + +cpdef Column replace( + Column input, + ColumnOrScalar target, + ColumnOrScalar repl, + size_type maxrepl = -1 +): + cdef: + unique_ptr[column] c_result + const string_scalar* target_str + const string_scalar* repl_str + + if ColumnOrScalar is Scalar: + target_str = (target.c_obj.get()) + repl_str = (repl.c_obj.get()) + + with nogil: + c_result = move(cpp_replace( + input.view(), + target_str[0], + repl_str[0], + maxrepl, + )) + else: + # Column case + # TODO: maxrepl should be supported in the corresponding CUDA/C++ code + with nogil: + c_result = move(cpp_replace( + input.view(), + target.view(), + repl.view(), + )) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column replace_slice( + Column input, + # TODO: default scalar values + # https://github.com/rapidsai/cudf/issues/15505 + Scalar repl = None, + size_type start = 0, + size_type stop = -1 +): + + cdef unique_ptr[column] c_result + + cdef const string_scalar* scalar_str + + if repl is None: + repl = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + scalar_str = (repl.c_obj.get()) + + with nogil: + c_result = move(cpp_replace_slice( + input.view(), + scalar_str[0], + start, + stop + )) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/types.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/types.pxd new file mode 100644 index 00000000000..1cc129f6e1a --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/types.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.scalar cimport Scalar + +ctypedef fused ColumnOrScalar: + Column + Scalar diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx index 880201e65a2..3b4ed5b1a54 100644 --- a/python/cudf/cudf/_lib/strings/replace.pyx +++ b/python/cudf/cudf/_lib/strings/replace.pyx @@ -1,22 +1,15 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.replace cimport ( - replace as cpp_replace, - replace_slice as cpp_replace_slice, -) from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.scalar cimport DeviceScalar +import cudf._lib.pylibcudf as plc + @acquire_spill_lock() def slice_replace(Column source_strings, @@ -31,22 +24,12 @@ def slice_replace(Column source_strings, cdef DeviceScalar repl = py_repl.device_value - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_replace_slice( - source_view, - scalar_str[0], - start, - stop - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace_slice( + source_strings.to_pylibcudf(mode="read"), + repl.c_value, + start, + stop + )) @acquire_spill_lock() @@ -60,22 +43,12 @@ def insert(Column source_strings, cdef DeviceScalar repl = py_repl.device_value - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_replace_slice( - source_view, - scalar_str[0], - start, - start - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace_slice( + source_strings.to_pylibcudf(mode="read"), + repl.c_value, + start, + start, + )) @acquire_spill_lock() @@ -91,25 +64,12 @@ def replace(Column source_strings, cdef DeviceScalar target = py_target.device_value cdef DeviceScalar repl = py_repl.device_value - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_target = ( - target.get_raw_ptr() - ) - cdef const string_scalar* scalar_repl = ( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_replace( - source_view, - scalar_target[0], - scalar_repl[0], - maxrepl - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace( + source_strings.to_pylibcudf(mode="read"), + target.c_value, + repl.c_value, + maxrepl + )) @acquire_spill_lock() @@ -120,16 +80,8 @@ def replace_multi(Column source_strings, Returns a Column after replacing occurrences of patterns `target_strings` with `repl_strings` in `source_strings`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view target_view = target_strings.view() - cdef column_view repl_view = repl_strings.view() - - with nogil: - c_result = move(cpp_replace( - source_view, - target_view, - repl_view - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace( + source_strings.to_pylibcudf(mode="read"), + target_strings.to_pylibcudf(mode="read"), + repl_strings.to_pylibcudf(mode="read"), + )) From 397ba14fb0319a7dc92160295459ce9f8ec4979c Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 23 May 2024 12:00:20 -0700 Subject: [PATCH 2/9] add tests --- .../pylibcudf_tests/test_string_replace.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_replace.py diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py new file mode 100644 index 00000000000..537325af1a3 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py @@ -0,0 +1,121 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def pa_data_col(): + return pa.array(["a", "c", "A", "aa", None, "aaaaaaaaa", "AAAA", "ÁÁÁÁ"]) + + +@pytest.fixture(scope="module") +def plc_data_col(pa_data_col): + return plc.interop.from_arrow(pa_data_col) + + +@pytest.fixture(scope="module", params=["a", "c", "A", "Á", "aa", "ÁÁÁ"]) +def scalar_repl_target(request): + pa_target = pa.scalar(request.param, type=pa.string()) + return (request.param, plc.interop.from_arrow(pa_target)) + + +@pytest.fixture(scope="module", params=["b", "B", "", "B́"]) +def scalar_repl(request): + pa_repl = pa.scalar(request.param, type=pa.string()) + return (request.param, plc.interop.from_arrow(pa_repl)) + + +@pytest.fixture( + scope="module", + params=[ + ["a", "c", "A", "ÁÁÁÁ"], + ], +) +def col_repl_target(request): + pa_target = pa.array(request.param, type=pa.string()) + return (pa_target, plc.interop.from_arrow(pa_target)) + + +@pytest.fixture( + scope="module", + params=[ + [ + "", + "z", + "XX", + "blahblah", + ] + ], +) +def col_repl(request): + pa_repl = pa.array(request.param, type=pa.string()) + return (pa_repl, plc.interop.from_arrow(pa_repl)) + + +@pytest.mark.parametrize("maxrepl", [-1, 1, 2, 10]) +def test_replace( + pa_data_col, plc_data_col, scalar_repl_target, scalar_repl, maxrepl +): + pa_target, plc_target = scalar_repl_target + pa_repl, plc_repl = scalar_repl + got = plc.strings.replace.replace( + plc_data_col, plc_target, plc_repl, maxrepl + ) + + expected = pa.compute.replace_substring( + pa_data_col, + pattern=pa_target, + replacement=pa_repl, + max_replacements=maxrepl, + ) + + assert_column_eq(got, expected) + + +@pytest.mark.parametrize("startstop", [(0, -1), (0, 0), (1, 3)]) +def test_replace_slice(pa_data_col, plc_data_col, scalar_repl, startstop): + pa_repl, plc_repl = scalar_repl + start, stop = startstop + got = plc.strings.replace.replace_slice( + plc_data_col, plc_repl, start, stop + ) + + if stop == -1: + # pyarrow doesn't support -1 as stop, so just set to really big number + stop = 1000 + + expected = pa.compute.utf8_replace_slice(pa_data_col, start, stop, pa_repl) + + assert_column_eq(got, expected) + + +def test_replace_col(pa_data_col, plc_data_col, col_repl_target, col_repl): + pa_target, plc_target = col_repl_target + pa_repl, plc_repl = col_repl + got = plc.strings.replace.replace(plc_data_col, plc_target, plc_repl) + + # There's nothing in pyarrow that does string replace with columns + # for targets/repls, so let's implement our own in python + + def replace_list(elem, targets, repls): + for target, repl in zip(targets, repls): + res = elem.replace(target, repl) + if res != elem: + return res + + targets = pa_target.to_pylist() + repls = pa_repl.to_pylist() + + expected = pa.array( + [ + replace_list(elem, targets, repls) if elem is not None else None + for elem in pa_data_col.to_pylist() + ], + type=pa.string(), + ) + + assert_column_eq(got, expected) From 4d3b40fa966a528cee7080a3a28986c89ae0af01 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 24 May 2024 10:50:56 -0700 Subject: [PATCH 3/9] update --- .../cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx | 10 ++++++---- .../cudf/cudf/pylibcudf_tests/test_string_replace.py | 12 ++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index aceadbf574e..8e3ab9d0869 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -17,5 +17,5 @@ set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings ASSOCIATED_TARGETS cudf + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf ) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx index aee4ebab1eb..02c1a94402f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx @@ -42,7 +42,11 @@ cpdef Column replace( )) else: # Column case - # TODO: maxrepl should be supported in the corresponding CUDA/C++ code + + if maxrepl != -1: + raise ValueError("maxrepl is not supported as a valid " + "argument when target and repl are Columns") + with nogil: c_result = move(cpp_replace( input.view(), @@ -64,14 +68,12 @@ cpdef Column replace_slice( cdef unique_ptr[column] c_result - cdef const string_scalar* scalar_str - if repl is None: repl = Scalar.from_libcudf( cpp_make_string_scalar("".encode()) ) - scalar_str = (repl.c_obj.get()) + cdef const string_scalar* scalar_str = (repl.c_obj.get()) with nogil: c_result = move(cpp_replace_slice( diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py index 537325af1a3..7f04465ba55 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py @@ -119,3 +119,15 @@ def replace_list(elem, targets, repls): ) assert_column_eq(got, expected) + + +def test_replace_col_invalid_arg(plc_data_col, col_repl_target, col_repl): + _, plc_target = col_repl_target + _, plc_repl = col_repl + with pytest.raises( + ValueError, + match="maxrepl is not supported as a valid argument when target and repl are Columns", + ): + plc.strings.replace.replace( + plc_data_col, plc_target, plc_repl, maxrepl=10 + ) From 0924bd2654b8e93124fc674ec54b711c2237c8c3 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 24 May 2024 13:06:55 -0700 Subject: [PATCH 4/9] rest of feedback --- python/cudf/cudf/_lib/pylibcudf/strings/find.pxd | 5 ++++- python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd | 4 +++- python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx | 2 +- python/cudf/cudf/_lib/pylibcudf/strings/types.pxd | 8 -------- python/cudf/cudf/pylibcudf_tests/test_string_replace.py | 3 +++ 5 files changed, 11 insertions(+), 11 deletions(-) delete mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/types.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd index 69ecb3e1f67..a36106d2e82 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd @@ -3,7 +3,10 @@ from cudf._lib.pylibcudf.column cimport Column from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.pylibcudf.scalar cimport Scalar -from cudf._lib.pylibcudf.strings.types cimport ColumnOrScalar + +ctypedef fused ColumnOrScalar: + Column + Scalar cpdef Column find( diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd index d6c6da78399..b34620cf505 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd @@ -3,8 +3,10 @@ from cudf._lib.pylibcudf.column cimport Column from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.pylibcudf.scalar cimport Scalar -from cudf._lib.pylibcudf.strings.types cimport ColumnOrScalar +ctypedef fused ColumnOrScalar: + Column + Scalar cpdef Column replace( Column input, diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx index 02c1a94402f..1166fdf727b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx @@ -15,7 +15,7 @@ from cudf._lib.pylibcudf.libcudf.strings.replace cimport ( ) from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.pylibcudf.scalar cimport Scalar -from cudf._lib.pylibcudf.strings.types cimport ColumnOrScalar +from cudf._lib.pylibcudf.strings.replace cimport ColumnOrScalar cpdef Column replace( diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/types.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/types.pxd deleted file mode 100644 index 1cc129f6e1a..00000000000 --- a/python/cudf/cudf/_lib/pylibcudf/strings/types.pxd +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from cudf._lib.pylibcudf.column cimport Column -from cudf._lib.pylibcudf.scalar cimport Scalar - -ctypedef fused ColumnOrScalar: - Column - Scalar diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py index 7f04465ba55..c0f50b324fe 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py @@ -86,6 +86,9 @@ def test_replace_slice(pa_data_col, plc_data_col, scalar_repl, startstop): if stop == -1: # pyarrow doesn't support -1 as stop, so just set to really big number + + # TODO: once libcudf's count_characters() is migrated, we can call + # count_characters on the input, take the max and set stop to that stop = 1000 expected = pa.compute.utf8_replace_slice(pa_data_col, start, stop, pa_repl) From 139a2e2988d9c2e9237de7e737f6fd4ed24f1fa4 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 29 May 2024 09:53:26 -0700 Subject: [PATCH 5/9] add docstrings for replace --- .../cudf/cudf/_lib/pylibcudf/strings/find.pxd | 1 - .../cudf/_lib/pylibcudf/strings/replace.pyx | 59 ++++++++++++++++++- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd index a36106d2e82..bb43069f190 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd @@ -8,7 +8,6 @@ ctypedef fused ColumnOrScalar: Column Scalar - cpdef Column find( Column input, ColumnOrScalar target, diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx index 1166fdf727b..e5d647b97d3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx @@ -24,6 +24,38 @@ cpdef Column replace( ColumnOrScalar repl, size_type maxrepl = -1 ): + """Replaces target string within each string with the specified replacement string. + + Null string entries will return null output string entries. + + For details, see :cpp:func:`replace`. + + Parameters + ---------- + input : Column + The input strings + target : Union[Column, Scalar] + String to search for in each string or Column containing strings + to search for in the input column. + + If target is a Column, repl must also be a Column. + repl : Union[Column, Scalar] + String to replace target with, or Column (of equal length to target) + of replacement strings. + + If repl is a Column, target must also be a Column. + maxrepl : size_type, default -1 + Maximum times to replace if target appears multiple times in the input string. + Default of -1 specifies replace all occurrences of target in each string. + + This option is not supported when target and repl are of type Column. + (all occurrences of target will always be replaced in that case) + + Returns + ------- + pylibcudf.Column + New string column with target replaced. + """ cdef: unique_ptr[column] c_result const string_scalar* target_str @@ -65,7 +97,32 @@ cpdef Column replace_slice( size_type start = 0, size_type stop = -1 ): - + """Replaces each string in the column with the provided repl string + within the [start,stop) character position range. + + Null string entries will return null output string entries. + This function can be used to insert a string into specific position + by specifying the same position value for start and stop. + The repl string can be appended to each string by specifying -1 + for both start and stop. + + For details, see :cpp:func:`replace_slice`. + + Parameters + ---------- + input : Column + The input strings + repl : Scalar, default "" + String scalar to replace target with. + start : size_type, default 0 + Start position where repl will be added. + stop : size_type, default -1 + End position (exclusive) to use for replacement. + Returns + ------- + pylibcudf.Column + New string column + """ cdef unique_ptr[column] c_result if repl is None: From cac8be11dc727a9370c8635d1e9b572b92e79b5b Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 3 Jun 2024 22:02:05 +0000 Subject: [PATCH 6/9] change module name in strings --- .../source/user_guide/api_docs/pylibcudf/strings/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index d943a697a44..8970fc80c0b 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -1,5 +1,5 @@ -String Functions -================ +strings +======= .. toctree:: :maxdepth: 1 From 4583f5cf33280da55987a6e9354ab27f192d1495 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 4 Jun 2024 14:14:30 +0000 Subject: [PATCH 7/9] fix errors --- python/cudf/cudf/_lib/strings/replace.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx index 81c5a54b821..374831f1833 100644 --- a/python/cudf/cudf/_lib/strings/replace.pyx +++ b/python/cudf/cudf/_lib/strings/replace.pyx @@ -80,7 +80,7 @@ def replace_multi(Column source_strings, Returns a Column after replacing occurrences of patterns `target_strings` with `repl_strings` in `source_strings`. """ - return Column.from_pylibcudf(plc.strings.replace.replace_multi( + return Column.from_pylibcudf(plc.strings.replace.replace_multiple( source_strings.to_pylibcudf(mode="read"), target_strings.to_pylibcudf(mode="read"), repl_strings.to_pylibcudf(mode="read"), From b589e2534ff01c4ce064ab1c6ef2d5317a5b5265 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 5 Jun 2024 08:23:15 -0700 Subject: [PATCH 8/9] Apply suggestions from code review Co-authored-by: Vyas Ramasubramani --- docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst | 2 +- python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index e696d8147f4..b6ad1157511 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -35,6 +35,6 @@ This page provides API documentation for pylibcudf. .. toctree:: :maxdepth: 2 - :caption: Submodules + :caption: Subpackages strings/index.rst diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx index ad955ca344b..c757150a600 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx @@ -40,7 +40,7 @@ cpdef Column replace( String to replace target with. maxrepl : size_type, default -1 Maximum times to replace if target appears multiple times in the input string. - Default of -1 specifies replace all occurrences of target in each string. + Default of -1 specifies to replace all occurrences of target in each string. Returns ------- From 65ae3d8f24bf778efa81334125953b28a863bec4 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 5 Jun 2024 15:28:24 +0000 Subject: [PATCH 9/9] small test adjustment --- .../pylibcudf_tests/test_string_replace.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py index 6b3100b2d79..f20edf6a506 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py @@ -8,25 +8,24 @@ @pytest.fixture(scope="module") -def pa_data_col(): - return pa.array(["a", "c", "A", "aa", None, "aaaaaaaaa", "AAAA", "ÁÁÁÁ"]) - - -@pytest.fixture(scope="module") -def plc_data_col(pa_data_col): - return plc.interop.from_arrow(pa_data_col) +def data_col(): + pa_data_col = pa.array( + ["a", "c", "A", "aa", None, "aaaaaaaaa", "AAAA", "ÁÁÁÁ"], + type=pa.string(), + ) + return pa_data_col, plc.interop.from_arrow(pa_data_col) @pytest.fixture(scope="module", params=["a", "c", "A", "Á", "aa", "ÁÁÁ"]) def scalar_repl_target(request): pa_target = pa.scalar(request.param, type=pa.string()) - return (request.param, plc.interop.from_arrow(pa_target)) + return request.param, plc.interop.from_arrow(pa_target) @pytest.fixture(scope="module", params=["b", "B", "", "B́"]) def scalar_repl(request): pa_repl = pa.scalar(request.param, type=pa.string()) - return (request.param, plc.interop.from_arrow(pa_repl)) + return request.param, plc.interop.from_arrow(pa_repl) @pytest.fixture( @@ -57,9 +56,8 @@ def col_repl(request): @pytest.mark.parametrize("maxrepl", [-1, 1, 2, 10]) -def test_replace( - pa_data_col, plc_data_col, scalar_repl_target, scalar_repl, maxrepl -): +def test_replace(data_col, scalar_repl_target, scalar_repl, maxrepl): + pa_data_col, plc_data_col = data_col pa_target, plc_target = scalar_repl_target pa_repl, plc_repl = scalar_repl got = plc.strings.replace.replace( @@ -77,7 +75,8 @@ def test_replace( @pytest.mark.parametrize("startstop", [(0, -1), (0, 0), (1, 3)]) -def test_replace_slice(pa_data_col, plc_data_col, scalar_repl, startstop): +def test_replace_slice(data_col, scalar_repl, startstop): + pa_data_col, plc_data_col = data_col pa_repl, plc_repl = scalar_repl start, stop = startstop got = plc.strings.replace.replace_slice( @@ -96,7 +95,8 @@ def test_replace_slice(pa_data_col, plc_data_col, scalar_repl, startstop): assert_column_eq(expected, got) -def test_replace_col(pa_data_col, plc_data_col, col_repl_target, col_repl): +def test_replace_col(data_col, col_repl_target, col_repl): + pa_data_col, plc_data_col = data_col pa_target, plc_target = col_repl_target pa_repl, plc_repl = col_repl got = plc.strings.replace.replace_multiple(