Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate string slice APIs to pylibcudf #15988

Merged
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ strings

contains
replace
slice
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=====
slice
=====

.. automodule:: cudf._lib.pylibcudf.strings.slice
:members:
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar

cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil:
cdef unique_ptr[scalar] make_string_scalar(const string & _string) except +
cdef unique_ptr[scalar] make_fixed_width_scalar[T](T value) except +
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# =============================================================================

set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
regex_program.pyx replace.pyx
regex_program.pyx replace.pyx slice.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ from . cimport (
regex_flags,
regex_program,
replace,
slice,
)
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@
regex_flags,
regex_program,
replace,
slice,
)
15 changes: 15 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.scalar cimport Scalar

ctypedef fused ColumnOrScalar:
Column
Scalar

cpdef Column slice_strings(
Column input,
ColumnOrScalar start=*,
ColumnOrScalar stop=*,
Scalar step=*
)
94 changes: 94 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
make_fixed_width_scalar as cpp_make_fixed_width_scalar,
)
from cudf._lib.pylibcudf.libcudf.strings cimport substring as cpp_slice
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not blocking this PR, but wonder if it makes sense to align the libcudf/pylibcudf structure on either substring or slice rather than using both interchangeably (though don't have much context with strings submodules to know if this was an intentional choice)

from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.pylibcudf.scalar cimport Scalar

from cython.operator import dereference


cpdef Column slice_strings(
Column input,
wence- marked this conversation as resolved.
Show resolved Hide resolved
ColumnOrScalar start=None,
ColumnOrScalar stop=None,
wence- marked this conversation as resolved.
Show resolved Hide resolved
Scalar step=None
):
"""Perform a slice operation on a strings column.

``start`` and ``stop`` may be a
:py:class:`~cudf._lib.pylibcudf.column.Column` or a
:py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. But ``step`` must be a
:py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.

For details, see :cpp:func:`cudf::strings::slice_strings`.

Parameters
----------
input : Column
Strings column for this operation
start : Union[Column, Scalar]
The start character position or positions.
stop : Union[Column, Scalar]
The end character position or positions
step : Scalar
Distance between input characters retrieved

Returns
-------
pylibcudf.Column
The result of the slice operation
"""
cdef unique_ptr[column] c_result
cdef numeric_scalar[size_type]* cpp_start
cdef numeric_scalar[size_type]* cpp_stop
cdef numeric_scalar[size_type]* cpp_step

if ColumnOrScalar is Column:
if step is not None:
raise ValueError("Column-wise slice does not support step")

with nogil:
c_result = cpp_slice.slice_strings(
input.view(),
start.view(),
stop.view()
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
)

elif ColumnOrScalar is Scalar:
if start is None:
start = Scalar.from_libcudf(
cpp_make_fixed_width_scalar(0)
)
if stop is None:
stop = Scalar.from_libcudf(
cpp_make_fixed_width_scalar(0)
)
if step is None:
step = Scalar.from_libcudf(
cpp_make_fixed_width_scalar(1)
)

cpp_start = <numeric_scalar[size_type]*>start.c_obj.get()
cpp_stop = <numeric_scalar[size_type]*>stop.c_obj.get()
cpp_step = <numeric_scalar[size_type]*>step.c_obj.get()

brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
with nogil:
c_result = cpp_slice.slice_strings(
input.view(),
dereference(cpp_start),
dereference(cpp_stop),
dereference(cpp_step)
)
else:
raise ValueError("start, stop, and step must be either Column or Scalar")

return Column.from_libcudf(move(c_result))
88 changes: 26 additions & 62 deletions python/cudf/cudf/_lib/strings/substring.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,16 @@

import numpy as np

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from cudf._lib.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.strings.substring cimport (
slice_strings as cpp_slice_strings,
)
from cudf._lib.pylibcudf.libcudf.types cimport size_type

from cudf._lib.scalar import as_device_scalar

from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
from cudf._lib.scalar cimport DeviceScalar

import cudf._lib.pylibcudf as plc


@acquire_spill_lock()
def slice_strings(Column source_strings,
Expand All @@ -32,30 +24,18 @@ def slice_strings(Column source_strings,
performed in steps by skipping `step` number of
characters in a string.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef DeviceScalar start_scalar = as_device_scalar(start, np.int32)
cdef DeviceScalar end_scalar = as_device_scalar(end, np.int32)
cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32)

cdef numeric_scalar[size_type]* start_numeric_scalar = \
<numeric_scalar[size_type]*>(
start_scalar.get_raw_ptr())
cdef numeric_scalar[size_type]* end_numeric_scalar = \
<numeric_scalar[size_type]*>(end_scalar.get_raw_ptr())
cdef numeric_scalar[size_type]* step_numeric_scalar = \
<numeric_scalar[size_type]*>(step_scalar.get_raw_ptr())

with nogil:
c_result = move(cpp_slice_strings(
source_view,
start_numeric_scalar[0],
end_numeric_scalar[0],
step_numeric_scalar[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.slice.slice_strings(
source_strings.to_pylibcudf(mode="read"),
start_scalar.c_value,
end_scalar.c_value,
step_scalar.c_value
)
)


@acquire_spill_lock()
Expand All @@ -67,19 +47,13 @@ def slice_from(Column source_strings,
at given starts and stops positions. `starts` and `stops`
here are positions per element in the string-column.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef column_view starts_view = starts.view()
cdef column_view stops_view = stops.view()

with nogil:
c_result = move(cpp_slice_strings(
source_view,
starts_view,
stops_view
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.slice.slice_strings(
source_strings.to_pylibcudf(mode="read"),
starts.to_pylibcudf(mode="read"),
stops.to_pylibcudf(mode="read")
)
)


@acquire_spill_lock()
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -90,8 +64,7 @@ def get(Column source_strings,
character from each input string. The index of
characters required can be controlled by passing `index`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

if index < 0:
next_index = index - 1
step = -1
Expand All @@ -102,20 +75,11 @@ def get(Column source_strings,
cdef DeviceScalar end_scalar = as_device_scalar(next_index, np.int32)
cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32)

cdef numeric_scalar[size_type]* start_numeric_scalar = \
<numeric_scalar[size_type]*>(
start_scalar.get_raw_ptr())
cdef numeric_scalar[size_type]* end_numeric_scalar = \
<numeric_scalar[size_type]*>(end_scalar.get_raw_ptr())
cdef numeric_scalar[size_type]* step_numeric_scalar = \
<numeric_scalar[size_type]*>(step_scalar.get_raw_ptr())

with nogil:
c_result = move(cpp_slice_strings(
source_view,
start_numeric_scalar[0],
end_numeric_scalar[0],
step_numeric_scalar[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.slice.slice_strings(
source_strings.to_pylibcudf(mode="read"),
start_scalar.c_value,
end_scalar.c_value,
step_scalar.c_value
)
)
99 changes: 99 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/test_string_slice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pytest
from utils import assert_column_eq

import cudf._lib.pylibcudf as plc


@pytest.fixture(scope="module")
def pa_col():
return pa.array(["AbC", "123abc", "", " ", None])


@pytest.fixture(scope="module")
def plc_col(pa_col):
return plc.interop.from_arrow(pa_col)


@pytest.fixture(
scope="module",
params=[(1, 3, 1), (0, 3, -1), (3, 2, 1), (1, 5, 5), (1, 100, 2)],
)
def pa_start_stop_step(request):
return tuple(pa.scalar(x, type=pa.int32()) for x in request.param)


@pytest.fixture(scope="module")
def plc_start_stop_step(pa_start_stop_step):
return tuple(plc.interop.from_arrow(x) for x in pa_start_stop_step)


@pytest.fixture(scope="module")
def pa_starts_col():
return pa.array([0, 1, 3, -1, 100])


@pytest.fixture(scope="module")
def plc_starts_col(pa_starts_col):
return plc.interop.from_arrow(pa_starts_col)


@pytest.fixture(scope="module")
def pa_stops_col():
return pa.array([1, 3, 4, -1, 100])


@pytest.fixture(scope="module")
def plc_stops_col(pa_stops_col):
return plc.interop.from_arrow(pa_stops_col)


def test_slice(pa_col, plc_col, pa_start_stop_step, plc_start_stop_step):
pa_start, pa_stop, pa_step = pa_start_stop_step
plc_start, plc_stop, plc_step = plc_start_stop_step

def slice_string(st, start, stop, step):
return st[start:stop:step] if st is not None else None

expected = pa.array(
[
slice_string(x, pa_start.as_py(), pa_stop.as_py(), pa_step.as_py())
for x in pa_col.to_pylist()
],
type=pa.string(),
)

got = plc.strings.slice.slice_strings(
plc_col, start=plc_start, stop=plc_stop, step=plc_step
)

assert_column_eq(expected, got)


def test_slice_column(
pa_col, plc_col, pa_starts_col, plc_starts_col, pa_stops_col, plc_stops_col
):
def slice_string(st, start, stop):
if stop < 0:
stop = len(st)
return st[start:stop] if st is not None else None

expected = pa.array(
[
slice_string(x, start, stop)
for x, start, stop in zip(
pa_col.to_pylist(),
pa_starts_col.to_pylist(),
pa_stops_col.to_pylist(),
)
],
type=pa.string(),
)

got = plc.strings.slice.slice_strings(
plc_col, plc_starts_col, plc_stops_col
)

assert_column_eq(expected, got)
Loading