Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Plumb pylibcudf strings contains_re through cudf_polars #15918

Merged
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
05d1eb4
initial
brandon-b-miller May 28, 2024
004ae1d
Merge branch 'branch-24.08' into pylibcudf-strings-contains
brandon-b-miller May 28, 2024
9a3f19d
merge/resolve
brandon-b-miller May 29, 2024
87341d4
one test
brandon-b-miller May 29, 2024
6d50191
tests, fixes
brandon-b-miller May 29, 2024
e35cd9a
declaration
brandon-b-miller May 29, 2024
69ad703
Merge branch 'branch-24.08' into pylibcudf-strings-contains
brandon-b-miller May 30, 2024
83178c9
docs, style
brandon-b-miller May 31, 2024
758755c
type create more strongly
brandon-b-miller May 31, 2024
98aeefa
add more tests
brandon-b-miller May 31, 2024
936e412
style
brandon-b-miller May 31, 2024
b15588a
regex program tests
brandon-b-miller May 31, 2024
b5a68c5
Merge branch 'branch-24.08' into pylibcudf-strings-contains
brandon-b-miller Jun 3, 2024
4b6a393
polars contains_re plumbing
brandon-b-miller Jun 4, 2024
6c125cb
refactor expr
brandon-b-miller Jun 5, 2024
9fb3a2b
add tests for invalid regex
brandon-b-miller Jun 5, 2024
0463688
merge latest/resolve conflicts
brandon-b-miller Jun 6, 2024
7543726
cleanup
brandon-b-miller Jun 6, 2024
42b158f
Address reviews
brandon-b-miller Jun 6, 2024
39b57ca
merge latest/resolve conflicts
brandon-b-miller Jun 10, 2024
e3fb170
refactor logic
brandon-b-miller Jun 10, 2024
da08309
merge latest/resolve
brandon-b-miller Jun 12, 2024
e45fbed
add literal column tests, support it, refactor logic
brandon-b-miller Jun 12, 2024
22e1031
add tests, refactor
brandon-b-miller Jun 12, 2024
4b643a7
pacify mypy
brandon-b-miller Jun 12, 2024
5533e5b
Make type-narrowing a no-op if run with `-O`
wence- Jun 13, 2024
ee42757
Merge branch 'branch-24.08' into cudf-polars-str-contains
wence- Jun 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/contains.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
========
Contains
========

.. automodule:: cudf._lib.pylibcudf.strings.contains
:members:
9 changes: 9 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,12 @@ This page provides API documentation for pylibcudf.
table
types
unary

String Functions
================

.. toctree::
:maxdepth: 1
:caption: String Functions

contains
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources char_types.pyx)
set(cython_sources char_types.pyx regex_flags.pyx)

set(linked_libraries cudf::cudf)

Expand Down
13 changes: 8 additions & 5 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

from libc.stdint cimport int32_t


cdef extern from "cudf/strings/regex/flags.hpp" \
namespace "cudf::strings" nogil:

ctypedef enum regex_flags:
DEFAULT 'cudf::strings::regex_flags::DEFAULT'
MULTILINE 'cudf::strings::regex_flags::MULTILINE'
DOTALL 'cudf::strings::regex_flags::DOTALL'
cpdef enum class regex_flags(int32_t):
DEFAULT
MULTILINE
DOTALL
Empty file.
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
# the License.
# =============================================================================

set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx)
set(cython_sources case.pyx capitalize.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
regex_program.pyx
)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
10 changes: 9 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport capitalize, case, char_types, find
from . cimport (
capitalize,
case,
char_types,
contains,
find,
regex_flags,
regex_program,
)
10 changes: 9 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import capitalize, case, char_types, find
from . import (
capitalize,
case,
char_types,
contains,
find,
regex_flags,
regex_program,
)
7 changes: 7 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram


cpdef Column contains_re(Column input, RegexProgram prog)
41 changes: 41 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.strings cimport contains as cpp_contains
from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram


cpdef Column contains_re(
    Column input,
    RegexProgram prog
):
    """Flag the rows of a strings column that match a regex program.

    For details, see :cpp:func:`cudf::strings::contains_re`.

    Parameters
    ----------
    input : Column
        The input strings
    prog : RegexProgram
        Regex program instance

    Returns
    -------
    pylibcudf.Column
        New column of boolean results for each string
    """
    cdef unique_ptr[column] c_result

    # The libcudf call is made with the GIL released.
    with nogil:
        c_result = cpp_contains.contains_re(
            input.view(),
            prog.c_obj.get()[0]
        )

    return Column.from_libcudf(move(c_result))
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.libcudf.strings.regex_flags import \
regex_flags as RegexFlags # no-cython-lint
10 changes: 10 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program


cdef class RegexProgram:
cdef unique_ptr[regex_program] c_obj
37 changes: 37 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright (c) 2024, NVIDIA CORPORATION.


from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program

from cudf._lib.pylibcudf.strings.regex_flags import RegexFlags
from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags


cdef class RegexProgram:
    """Python-visible owner of a libcudf ``regex_program``.

    Instances must be obtained from :meth:`create`; calling the constructor
    directly raises ``ValueError``.
    """

    def __init__(self, *args, **kwargs):
        raise ValueError("Do not instantiate RegexProgram directly, use create")

    @staticmethod
    def create(str pattern, regex_flags flags):
        """Compile ``pattern`` into a new ``RegexProgram``.

        Parameters
        ----------
        pattern : str
            The regex pattern. It is encoded with ``str.encode()`` before
            being passed to ``regex_program.create``.
        flags : regex_flags
            Flags forwarded unchanged to ``regex_program.create``.

        Returns
        -------
        RegexProgram
            A new instance owning the compiled program.
        """
        cdef unique_ptr[regex_program] c_prog
        cdef string c_pattern = pattern.encode()

        # __new__ bypasses __init__, which unconditionally raises.
        cdef RegexProgram ret = RegexProgram.__new__(RegexProgram)

        # ``flags`` is already a C-level ``regex_flags`` value thanks to the
        # typed signature, so no runtime isinstance() dispatch is needed. The
        # previous ``isinstance(flags, object)`` test was always true, which
        # left its ``else: raise ValueError`` branch unreachable and allowed
        # an instance with an empty ``c_obj`` to be returned.
        with nogil:
            c_prog = regex_program.create(c_pattern, flags)

        ret.c_obj = move(c_prog)
        return ret
23 changes: 7 additions & 16 deletions python/cudf/cudf/_lib/strings/contains.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
from cudf._lib.pylibcudf.libcudf.strings.contains cimport (
contains_re as cpp_contains_re,
count_re as cpp_count_re,
like as cpp_like,
matches_re as cpp_matches_re,
Expand All @@ -23,28 +22,20 @@ from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
from cudf._lib.scalar cimport DeviceScalar

from cudf._lib.pylibcudf.strings import contains
from cudf._lib.pylibcudf.strings.regex_program import RegexProgram


@acquire_spill_lock()
def contains_re(Column source_strings, object reg_ex, uint32_t flags):
    """
    Returns a Column of boolean values with True for `source_strings`
    that contain regular expression `reg_ex`.

    The work is delegated to pylibcudf: `reg_ex` is stringified and compiled
    with ``RegexProgram.create`` using `flags`, then matched against the
    pylibcudf view of `source_strings` with ``contains.contains_re``.
    """
    # The earlier direct-libcudf body (ending in its own ``return``) made the
    # pylibcudf path below unreachable; only the pylibcudf path is kept.
    prog = RegexProgram.create(str(reg_ex), flags)
    return Column.from_pylibcudf(
        contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog)
    )


@acquire_spill_lock()
Expand Down
13 changes: 13 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/test_regex_program.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pytest

import cudf._lib.pylibcudf as plc


@pytest.mark.parametrize("pat", ["(", "*", "\\"])
def test_regex_program_invalid(pat):
    """Compiling a malformed pattern is expected to raise RuntimeError."""
    default_flags = plc.strings.regex_flags.RegexFlags.DEFAULT
    make_program = plc.strings.regex_program.RegexProgram.create
    with pytest.raises(RuntimeError):
        make_program(pat, default_flags)
55 changes: 55 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/test_string_contains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pytest
from utils import assert_column_eq

import cudf._lib.pylibcudf as plc


@pytest.fixture(scope="module")
def pa_target_col():
    """Arrow array of mixed-case strings, including two null entries."""
    values = ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]
    return pa.array(values)


@pytest.fixture(scope="module")
def plc_target_col(pa_target_col):
return plc.interop.from_arrow(pa_target_col)


@pytest.fixture(
    scope="module",
    params=[
        "A",
        "de",
        ".*",
        "^a",
        "^A",
        "[^a-z]",
        "[a-z]{3,}",
        "^[A-Z]{2,}",
        "j|u",
    ],
)
def pa_target_scalar(request):
    """Each parametrized regex pattern wrapped as an Arrow string scalar."""
    pattern = request.param
    return pa.scalar(pattern, type=pa.string())


@pytest.fixture(scope="module")
def plc_target_pat(pa_target_scalar):
    """A RegexProgram built from the current pattern with DEFAULT flags."""
    pattern = pa_target_scalar.as_py()
    default_flags = plc.strings.regex_flags.RegexFlags.DEFAULT
    return plc.strings.regex_program.RegexProgram.create(pattern, default_flags)


def test_contains_re(
    pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat
):
    """Compare ``contains_re`` with pyarrow's ``match_substring_regex``."""
    pattern = pa_target_scalar.as_py()
    expected = pa.compute.match_substring_regex(pa_target_col, pattern)
    got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat)
    assert_column_eq(got, expected)
26 changes: 26 additions & 0 deletions python/cudf_polars/cudf_polars/dsl/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,13 +577,23 @@ def __init__(
self.options = options
self.name = name
self.children = children
self._validate_input()

def _validate_input(self):
    """Reject string functions and option combinations that are unsupported.

    Raises
    ------
    NotImplementedError
        If ``self.name`` is not one of the string functions listed below,
        or, for ``Contains``, if ``strict`` is falsy or the second child
        (the pattern) is not a ``Literal``.
    """
    if self.name not in (
        pl_expr.StringFunction.Lowercase,
        pl_expr.StringFunction.Uppercase,
        pl_expr.StringFunction.EndsWith,
        pl_expr.StringFunction.StartsWith,
        pl_expr.StringFunction.Contains,
    ):
        raise NotImplementedError(f"String function {self.name}")
    if self.name == pl_expr.StringFunction.Contains:
        # options is a pair; only its second element (strict) is checked here.
        _, strict = self.options
        if not strict:
            raise NotImplementedError("strict=False not supported in contains")
        # children[1] is the pattern operand of contains.
        if not isinstance(self.children[1], Literal):
            raise NotImplementedError("contains pattern must be a single scalar")

def do_evaluate(
self,
Expand Down Expand Up @@ -613,6 +623,22 @@ def do_evaluate(
return Column(
plc.strings.find.starts_with(column.obj, suffix.obj), column.name
)
elif self.name == pl_expr.StringFunction.Contains:
column, pattern = columns
literal, _ = self.options
if literal:
return Column(
plc.strings.find.contains(column.obj, pattern.obj), column.name
)
else:
# TODO: hack - the pattern is round-tripped through pyarrow to obtain a Python str for RegexProgram.create
pattern = plc.interop.to_arrow(pattern.obj).as_py()
prog = plc.strings.regex_program.RegexProgram.create(
pattern, flags=plc.strings.regex_flags.RegexFlags.DEFAULT
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
)
return Column(
plc.strings.contains.contains_re(column.obj, prog), column.name
)
else:
raise NotImplementedError(f"StringFunction {self.name}")

Expand Down
46 changes: 46 additions & 0 deletions python/cudf_polars/tests/test_string.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

import pytest

import polars as pl

from cudf_polars.testing.asserts import assert_gpu_result_equal


@pytest.mark.parametrize(
    "substr",
    [
        "A",
        "de",
        ".*",
        "^a",
        "^A",
        "[^a-z]",
        "[a-z]{3,}",
        "^[A-Z]{2,}",
        "j|u",
    ],
)
def test_contains(substr):
    """Check ``str.contains`` with each pattern via ``assert_gpu_result_equal``."""
    ldf = pl.DataFrame(
        {"a": ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]}
    ).lazy()

    query = ldf.select(pl.col("a").str.contains(substr))
    assert_gpu_result_equal(query)


@pytest.mark.parametrize("pat", ["["])
def test_contains_invalid(pat):
    """An invalid pattern must raise ComputeError from both collect calls."""
    strings = ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]
    lazy_frame = pl.DataFrame({"a": strings}).lazy()
    query = lazy_frame.select(pl.col("a").str.contains(pat))

    # Default collect first, then the GPU-requested collect.
    with pytest.raises(pl.exceptions.ComputeError):
        query.collect()
    with pytest.raises(pl.exceptions.ComputeError):
        query.collect(use_gpu=True)
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
Loading