From e0b8ab01deb66ea0726373ecb9cc77cbbf0666cd Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 26 Jun 2024 11:00:25 -0500 Subject: [PATCH] Migrate string `slice` APIs to `pylibcudf` (#15988) This PR introduces pylibcudf string `slice` APIs and migrates the cuDF cython to use them. Part of https://github.com/rapidsai/cudf/issues/15162 Authors: - https://github.com/brandon-b-miller - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15988 --- .../api_docs/pylibcudf/strings/index.rst | 1 + .../api_docs/pylibcudf/strings/slice.rst | 6 + .../libcudf/scalar/scalar_factories.pxd | 1 + .../_lib/pylibcudf/strings/CMakeLists.txt | 2 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 1 + .../cudf/_lib/pylibcudf/strings/__init__.py | 1 + .../cudf/_lib/pylibcudf/strings/slice.pxd | 15 +++ .../cudf/_lib/pylibcudf/strings/slice.pyx | 102 +++++++++++++++ python/cudf/cudf/_lib/strings/substring.pyx | 88 ++++--------- .../cudf/pylibcudf_tests/test_string_slice.py | 116 ++++++++++++++++++ 10 files changed, 270 insertions(+), 63 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_slice.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index bfaef732555..cecf1ccc9bb 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -6,3 +6,4 @@ strings contains replace + slice diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst new file mode 100644 index 00000000000..0ee5af71c03 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst @@ -0,0 +1,6 @@ +===== +slice +===== + +.. automodule:: cudf._lib.pylibcudf.strings.slice + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd index 5c4e5bf346f..c8220df8938 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd @@ -8,3 +8,4 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: cdef unique_ptr[scalar] make_string_scalar(const string & _string) except + + cdef unique_ptr[scalar] make_fixed_width_scalar[T](T value) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index cb7f71b1912..b499a127541 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx - regex_program.pyx replace.pyx + regex_program.pyx replace.pyx slice.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index 959aa94737d..d1f632d6d8e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -9,4 +9,5 @@ from . cimport ( regex_flags, regex_program, replace, + slice, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index b7384913286..ef102aff2af 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -9,4 +9,5 @@ regex_flags, regex_program, replace, + slice, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd new file mode 100644 index 00000000000..7d8d0006ef4 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.scalar cimport Scalar + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column slice_strings( + Column input, + ColumnOrScalar start=*, + ColumnOrScalar stop=*, + Scalar step=* +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx new file mode 100644 index 00000000000..df75134fb71 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx @@ -0,0 +1,102 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar +from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_fixed_width_scalar as cpp_make_fixed_width_scalar, +) +from cudf._lib.pylibcudf.libcudf.strings cimport substring as cpp_slice +from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column slice_strings( + Column input, + ColumnOrScalar start=None, + ColumnOrScalar stop=None, + Scalar step=None +): + """Perform a slice operation on a strings column. + + ``start`` and ``stop`` may be a + :py:class:`~cudf._lib.pylibcudf.column.Column` or a + :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. But ``step`` must be a + :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. + + For details, see :cpp:func:`cudf::strings::slice_strings`. + + Parameters + ---------- + input : Column + Strings column for this operation + start : Union[Column, Scalar] + The start character position or positions. + stop : Union[Column, Scalar] + The end character position or positions + step : Scalar + Distance between input characters retrieved + + Returns + ------- + pylibcudf.Column + The result of the slice operation + """ + cdef unique_ptr[column] c_result + cdef numeric_scalar[size_type]* cpp_start + cdef numeric_scalar[size_type]* cpp_stop + cdef numeric_scalar[size_type]* cpp_step + + if input is None: + raise ValueError("input cannot be None") + + if ColumnOrScalar is Column: + if step is not None: + raise ValueError("Column-wise slice does not support step") + + if start is None or stop is None: + raise ValueError( + "start and stop must be provided for Column-wise slice" + ) + + with nogil: + c_result = cpp_slice.slice_strings( + input.view(), + start.view(), + stop.view() + ) + + elif ColumnOrScalar is Scalar: + if start is None: + start = Scalar.from_libcudf( + cpp_make_fixed_width_scalar(0) + ) + if stop is None: + stop = Scalar.from_libcudf( + cpp_make_fixed_width_scalar(0) + ) + if step is None: + step = Scalar.from_libcudf( + cpp_make_fixed_width_scalar(1) + ) + + cpp_start = start.c_obj.get() + cpp_stop = stop.c_obj.get() + cpp_step = step.c_obj.get() + + with nogil: + c_result = cpp_slice.slice_strings( + input.view(), + dereference(cpp_start), + dereference(cpp_stop), + dereference(cpp_step) + ) + else: + raise ValueError("start, stop, and step must be either Column or Scalar") + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/substring.pyx b/python/cudf/cudf/_lib/strings/substring.pyx index 170c1016b89..706c21c0634 100644 --- a/python/cudf/cudf/_lib/strings/substring.pyx +++ b/python/cudf/cudf/_lib/strings/substring.pyx @@ -2,24 +2,16 @@ import numpy as np -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.substring cimport ( - slice_strings as cpp_slice_strings, -) -from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.scalar import as_device_scalar -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from cudf._lib.scalar cimport DeviceScalar +import cudf._lib.pylibcudf as plc + @acquire_spill_lock() def slice_strings(Column source_strings, @@ -32,30 +24,18 @@ def slice_strings(Column source_strings, performed in steps by skipping `step` number of characters in a string. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef DeviceScalar start_scalar = as_device_scalar(start, np.int32) cdef DeviceScalar end_scalar = as_device_scalar(end, np.int32) cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - cdef numeric_scalar[size_type]* start_numeric_scalar = \ - ( - start_scalar.get_raw_ptr()) - cdef numeric_scalar[size_type]* end_numeric_scalar = \ - (end_scalar.get_raw_ptr()) - cdef numeric_scalar[size_type]* step_numeric_scalar = \ - (step_scalar.get_raw_ptr()) - - with nogil: - c_result = move(cpp_slice_strings( - source_view, - start_numeric_scalar[0], - end_numeric_scalar[0], - step_numeric_scalar[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.slice.slice_strings( + source_strings.to_pylibcudf(mode="read"), + start_scalar.c_value, + end_scalar.c_value, + step_scalar.c_value + ) + ) @acquire_spill_lock() @@ -67,19 +47,13 @@ def slice_from(Column source_strings, at given starts and stops positions. `starts` and `stops` here are positions per element in the string-column. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view starts_view = starts.view() - cdef column_view stops_view = stops.view() - - with nogil: - c_result = move(cpp_slice_strings( - source_view, - starts_view, - stops_view - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.slice.slice_strings( + source_strings.to_pylibcudf(mode="read"), + starts.to_pylibcudf(mode="read"), + stops.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() @@ -90,8 +64,7 @@ def get(Column source_strings, character from each input string. The index of characters required can be controlled by passing `index`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() + if index < 0: next_index = index - 1 step = -1 @@ -102,20 +75,11 @@ def get(Column source_strings, cdef DeviceScalar end_scalar = as_device_scalar(next_index, np.int32) cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - cdef numeric_scalar[size_type]* start_numeric_scalar = \ - ( - start_scalar.get_raw_ptr()) - cdef numeric_scalar[size_type]* end_numeric_scalar = \ - (end_scalar.get_raw_ptr()) - cdef numeric_scalar[size_type]* step_numeric_scalar = \ - (step_scalar.get_raw_ptr()) - - with nogil: - c_result = move(cpp_slice_strings( - source_view, - start_numeric_scalar[0], - end_numeric_scalar[0], - step_numeric_scalar[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.slice.slice_strings( + source_strings.to_pylibcudf(mode="read"), + start_scalar.c_value, + end_scalar.c_value, + step_scalar.c_value + ) + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py new file mode 100644 index 00000000000..bd63987b30f --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py @@ -0,0 +1,116 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def pa_col(): + return pa.array(["AbC", "123abc", "", " ", None]) + + +@pytest.fixture(scope="module") +def plc_col(pa_col): + return plc.interop.from_arrow(pa_col) + + +@pytest.fixture( + scope="module", + params=[(1, 3, 1), (0, 3, -1), (3, 2, 1), (1, 5, 5), (1, 100, 2)], +) +def pa_start_stop_step(request): + return tuple(pa.scalar(x, type=pa.int32()) for x in request.param) + + +@pytest.fixture(scope="module") +def plc_start_stop_step(pa_start_stop_step): + return tuple(plc.interop.from_arrow(x) for x in pa_start_stop_step) + + +@pytest.fixture(scope="module") +def pa_starts_col(): + return pa.array([0, 1, 3, -1, 100]) + + +@pytest.fixture(scope="module") +def plc_starts_col(pa_starts_col): + return plc.interop.from_arrow(pa_starts_col) + + +@pytest.fixture(scope="module") +def pa_stops_col(): + return pa.array([1, 3, 4, -1, 100]) + + +@pytest.fixture(scope="module") +def plc_stops_col(pa_stops_col): + return plc.interop.from_arrow(pa_stops_col) + + +def test_slice(pa_col, plc_col, pa_start_stop_step, plc_start_stop_step): + pa_start, pa_stop, pa_step = pa_start_stop_step + plc_start, plc_stop, plc_step = plc_start_stop_step + + def slice_string(st, start, stop, step): + return st[start:stop:step] if st is not None else None + + expected = pa.array( + [ + slice_string(x, pa_start.as_py(), pa_stop.as_py(), pa_step.as_py()) + for x in pa_col.to_pylist() + ], + type=pa.string(), + ) + + got = plc.strings.slice.slice_strings( + plc_col, start=plc_start, stop=plc_stop, step=plc_step + ) + + assert_column_eq(expected, got) + + +def test_slice_column( + pa_col, plc_col, pa_starts_col, plc_starts_col, pa_stops_col, plc_stops_col +): + def slice_string(st, start, stop): + if stop < 0: + stop = len(st) + return st[start:stop] if st is not None else None + + expected = pa.array( + [ + slice_string(x, start, stop) + for x, start, stop in zip( + pa_col.to_pylist(), + pa_starts_col.to_pylist(), + pa_stops_col.to_pylist(), + ) + ], + type=pa.string(), + ) + + got = plc.strings.slice.slice_strings( + plc_col, plc_starts_col, plc_stops_col + ) + + assert_column_eq(expected, got) + + +def test_slice_invalid(plc_col, plc_starts_col, plc_stops_col): + with pytest.raises(TypeError): + # no maching signature + plc.strings.slice.slice_strings(None, pa_starts_col, pa_stops_col) + with pytest.raises(ValueError): + # signature found but wrong value passed + plc.strings.slice.slice_strings(plc_col, plc_starts_col, None) + with pytest.raises(TypeError): + # no matching signature (2nd arg) + plc.strings.slice.slice_strings(plc_col, None, plc_stops_col) + with pytest.raises(TypeError): + # can't provide step for columnwise api + plc.strings.slice.slice_strings( + plc_col, plc_starts_col, plc_stops_col, plc_starts_col + )