Skip to content

Commit

Permalink
Migrate string slice APIs to pylibcudf (#15988)
Browse files Browse the repository at this point in the history
This PR introduces pylibcudf string `slice` APIs and migrates the cuDF cython to use them. Part of #15162

Authors:
  - https://github.com/brandon-b-miller
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #15988
  • Loading branch information
brandon-b-miller authored Jun 26, 2024
1 parent bfaddd3 commit e0b8ab0
Show file tree
Hide file tree
Showing 10 changed files with 270 additions and 63 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ strings

contains
replace
slice
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=====
slice
=====

.. automodule:: cudf._lib.pylibcudf.strings.slice
:members:
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar

cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil:
cdef unique_ptr[scalar] make_string_scalar(const string & _string) except +
cdef unique_ptr[scalar] make_fixed_width_scalar[T](T value) except +
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# =============================================================================

set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
regex_program.pyx replace.pyx
regex_program.pyx replace.pyx slice.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ from . cimport (
regex_flags,
regex_program,
replace,
slice,
)
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@
regex_flags,
regex_program,
replace,
slice,
)
15 changes: 15 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.scalar cimport Scalar

ctypedef fused ColumnOrScalar:
Column
Scalar

cpdef Column slice_strings(
Column input,
ColumnOrScalar start=*,
ColumnOrScalar stop=*,
Scalar step=*
)
102 changes: 102 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
make_fixed_width_scalar as cpp_make_fixed_width_scalar,
)
from cudf._lib.pylibcudf.libcudf.strings cimport substring as cpp_slice
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.pylibcudf.scalar cimport Scalar

from cython.operator import dereference


cpdef Column slice_strings(
Column input,
ColumnOrScalar start=None,
ColumnOrScalar stop=None,
Scalar step=None
):
"""Perform a slice operation on a strings column.
``start`` and ``stop`` may be a
:py:class:`~cudf._lib.pylibcudf.column.Column` or a
:py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. But ``step`` must be a
:py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
For details, see :cpp:func:`cudf::strings::slice_strings`.
Parameters
----------
input : Column
Strings column for this operation
start : Union[Column, Scalar]
The start character position or positions.
stop : Union[Column, Scalar]
The end character position or positions
step : Scalar
Distance between input characters retrieved
Returns
-------
pylibcudf.Column
The result of the slice operation
"""
cdef unique_ptr[column] c_result
cdef numeric_scalar[size_type]* cpp_start
cdef numeric_scalar[size_type]* cpp_stop
cdef numeric_scalar[size_type]* cpp_step

if input is None:
raise ValueError("input cannot be None")

if ColumnOrScalar is Column:
if step is not None:
raise ValueError("Column-wise slice does not support step")

if start is None or stop is None:
raise ValueError(
"start and stop must be provided for Column-wise slice"
)

with nogil:
c_result = cpp_slice.slice_strings(
input.view(),
start.view(),
stop.view()
)

elif ColumnOrScalar is Scalar:
if start is None:
start = Scalar.from_libcudf(
cpp_make_fixed_width_scalar(0)
)
if stop is None:
stop = Scalar.from_libcudf(
cpp_make_fixed_width_scalar(0)
)
if step is None:
step = Scalar.from_libcudf(
cpp_make_fixed_width_scalar(1)
)

cpp_start = <numeric_scalar[size_type]*>start.c_obj.get()
cpp_stop = <numeric_scalar[size_type]*>stop.c_obj.get()
cpp_step = <numeric_scalar[size_type]*>step.c_obj.get()

with nogil:
c_result = cpp_slice.slice_strings(
input.view(),
dereference(cpp_start),
dereference(cpp_stop),
dereference(cpp_step)
)
else:
raise ValueError("start, stop, and step must be either Column or Scalar")

return Column.from_libcudf(move(c_result))
88 changes: 26 additions & 62 deletions python/cudf/cudf/_lib/strings/substring.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,16 @@

import numpy as np

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from cudf._lib.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.strings.substring cimport (
slice_strings as cpp_slice_strings,
)
from cudf._lib.pylibcudf.libcudf.types cimport size_type

from cudf._lib.scalar import as_device_scalar

from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
from cudf._lib.scalar cimport DeviceScalar

import cudf._lib.pylibcudf as plc


@acquire_spill_lock()
def slice_strings(Column source_strings,
Expand All @@ -32,30 +24,18 @@ def slice_strings(Column source_strings,
performed in steps by skipping `step` number of
characters in a string.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef DeviceScalar start_scalar = as_device_scalar(start, np.int32)
cdef DeviceScalar end_scalar = as_device_scalar(end, np.int32)
cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32)

cdef numeric_scalar[size_type]* start_numeric_scalar = \
<numeric_scalar[size_type]*>(
start_scalar.get_raw_ptr())
cdef numeric_scalar[size_type]* end_numeric_scalar = \
<numeric_scalar[size_type]*>(end_scalar.get_raw_ptr())
cdef numeric_scalar[size_type]* step_numeric_scalar = \
<numeric_scalar[size_type]*>(step_scalar.get_raw_ptr())

with nogil:
c_result = move(cpp_slice_strings(
source_view,
start_numeric_scalar[0],
end_numeric_scalar[0],
step_numeric_scalar[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.slice.slice_strings(
source_strings.to_pylibcudf(mode="read"),
start_scalar.c_value,
end_scalar.c_value,
step_scalar.c_value
)
)


@acquire_spill_lock()
Expand All @@ -67,19 +47,13 @@ def slice_from(Column source_strings,
at given starts and stops positions. `starts` and `stops`
here are positions per element in the string-column.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef column_view starts_view = starts.view()
cdef column_view stops_view = stops.view()

with nogil:
c_result = move(cpp_slice_strings(
source_view,
starts_view,
stops_view
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.slice.slice_strings(
source_strings.to_pylibcudf(mode="read"),
starts.to_pylibcudf(mode="read"),
stops.to_pylibcudf(mode="read")
)
)


@acquire_spill_lock()
Expand All @@ -90,8 +64,7 @@ def get(Column source_strings,
character from each input string. The index of
characters required can be controlled by passing `index`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

if index < 0:
next_index = index - 1
step = -1
Expand All @@ -102,20 +75,11 @@ def get(Column source_strings,
cdef DeviceScalar end_scalar = as_device_scalar(next_index, np.int32)
cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32)

cdef numeric_scalar[size_type]* start_numeric_scalar = \
<numeric_scalar[size_type]*>(
start_scalar.get_raw_ptr())
cdef numeric_scalar[size_type]* end_numeric_scalar = \
<numeric_scalar[size_type]*>(end_scalar.get_raw_ptr())
cdef numeric_scalar[size_type]* step_numeric_scalar = \
<numeric_scalar[size_type]*>(step_scalar.get_raw_ptr())

with nogil:
c_result = move(cpp_slice_strings(
source_view,
start_numeric_scalar[0],
end_numeric_scalar[0],
step_numeric_scalar[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.slice.slice_strings(
source_strings.to_pylibcudf(mode="read"),
start_scalar.c_value,
end_scalar.c_value,
step_scalar.c_value
)
)
Loading

0 comments on commit e0b8ab0

Please sign in to comment.