From 7a739ce00dc5b9c78f3eb83cd6cfa44058855bec Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 5 Apr 2023 14:00:35 -0400 Subject: [PATCH] Add except declaration in Cython interface for regex_program::create (#13054) Add the `except +` declaration to the `cudf::strings::regex_program::create()` function in the Cython `regex_program.pxd` interface since this call throws an exception for invalid regex patterns. This allows the normal Cython exception handling to pass the exception to the Python logic without aborting the process. Closes #13052 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/13054 --- .../cudf/_lib/cpp/strings/regex_program.pxd | 7 +++++-- python/cudf/cudf/_lib/strings/contains.pyx | 14 +++++++------- python/cudf/cudf/_lib/strings/extract.pyx | 6 +++--- python/cudf/cudf/_lib/strings/findall.pyx | 6 +++--- python/cudf/cudf/_lib/strings/replace_re.pyx | 10 +++++----- python/cudf/cudf/_lib/strings/split/split.pyx | 18 +++++++++--------- python/cudf/cudf/tests/test_string.py | 6 ++++++ 7 files changed, 38 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/strings/regex_program.pxd b/python/cudf/cudf/_lib/cpp/strings/regex_program.pxd index 4859fccb752..7818c9c7d01 100644 --- a/python/cudf/cudf/_lib/cpp/strings/regex_program.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/regex_program.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -12,4 +12,7 @@ cdef extern from "cudf/strings/regex/regex_program.hpp" \ cdef cppclass regex_program: @staticmethod - unique_ptr[regex_program] create(string pattern, regex_flags flags) + unique_ptr[regex_program] create( + string pattern, + regex_flags flags + ) except + diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 007d28c21d7..82034f7f8b7 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cython.operator cimport dereference from libc.stdint cimport uint32_t @@ -35,10 +35,10 @@ def contains_re(Column source_strings, object reg_ex, uint32_t flags): cdef string reg_ex_string = str(reg_ex).encode() cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog = \ - regex_program.create(reg_ex_string, c_flags) + cdef unique_ptr[regex_program] c_prog with nogil: + c_prog = move(regex_program.create(reg_ex_string, c_flags)) c_result = move(cpp_contains_re( source_view, dereference(c_prog) @@ -58,10 +58,10 @@ def count_re(Column source_strings, object reg_ex, uint32_t flags): cdef string reg_ex_string = str(reg_ex).encode() cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog = \ - regex_program.create(reg_ex_string, c_flags) + cdef unique_ptr[regex_program] c_prog with nogil: + c_prog = move(regex_program.create(reg_ex_string, c_flags)) c_result = move(cpp_count_re( source_view, dereference(c_prog) @@ -81,10 +81,10 @@ def match_re(Column source_strings, object reg_ex, uint32_t flags): cdef string reg_ex_string = str(reg_ex).encode() cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog = \ - regex_program.create(reg_ex_string, c_flags) + cdef unique_ptr[regex_program] c_prog with nogil: + c_prog = 
move(regex_program.create(reg_ex_string, c_flags)) c_result = move(cpp_matches_re( source_view, dereference(c_prog) diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 04b0140bc73..d3d8610cdf0 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cython.operator cimport dereference from libc.stdint cimport uint32_t @@ -30,10 +30,10 @@ def extract(Column source_strings, object pattern, uint32_t flags): cdef string pattern_string = str(pattern).encode() cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog = \ - regex_program.create(pattern_string, c_flags) + cdef unique_ptr[regex_program] c_prog with nogil: + c_prog = move(regex_program.create(pattern_string, c_flags)) c_result = move(cpp_extract( source_view, dereference(c_prog) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index a3a835025bf..6df1d32dcfe 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. 
from cython.operator cimport dereference from libc.stdint cimport uint32_t @@ -27,10 +27,10 @@ def findall(Column source_strings, object pattern, uint32_t flags): cdef string pattern_string = str(pattern).encode() cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog = \ - regex_program.create(pattern_string, c_flags) + cdef unique_ptr[regex_program] c_prog with nogil: + c_prog = move(regex_program.create(pattern_string, c_flags)) c_result = move(cpp_findall( source_view, dereference(c_prog) diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx index 9dc47195185..1fbbaa8f44f 100644 --- a/python/cudf/cudf/_lib/strings/replace_re.pyx +++ b/python/cudf/cudf/_lib/strings/replace_re.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cython.operator cimport dereference from libcpp.memory cimport unique_ptr @@ -43,10 +43,10 @@ def replace_re(Column source_strings, cdef const string_scalar* scalar_repl = \ (repl.get_raw_ptr()) cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog = \ - regex_program.create(pattern_string, c_flags) + cdef unique_ptr[regex_program] c_prog with nogil: + c_prog = move(regex_program.create(pattern_string, c_flags)) c_result = move(cpp_replace_re( source_view, dereference(c_prog), @@ -73,10 +73,10 @@ def replace_with_backrefs( cdef string pattern_string = str(pattern).encode() cdef string repl_string = str(repl).encode() cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog = \ - regex_program.create(pattern_string, c_flags) + cdef unique_ptr[regex_program] c_prog with nogil: + c_prog = move(regex_program.create(pattern_string, c_flags)) c_result = move(cpp_replace_with_backrefs( source_view, dereference(c_prog), diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index 
8e01485425d..08c7dde921f 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cython.operator cimport dereference from libcpp.memory cimport unique_ptr @@ -163,10 +163,10 @@ def split_re(Column source_strings, cdef column_view source_view = source_strings.view() cdef string pattern_string = str(pattern).encode() cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog = \ - regex_program.create(pattern_string, c_flags) + cdef unique_ptr[regex_program] c_prog with nogil: + c_prog = move(regex_program.create(pattern_string, c_flags)) c_result = move(cpp_split_re( source_view, dereference(c_prog), @@ -192,10 +192,10 @@ def rsplit_re(Column source_strings, cdef column_view source_view = source_strings.view() cdef string pattern_string = str(pattern).encode() cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog = \ - regex_program.create(pattern_string, c_flags) + cdef unique_ptr[regex_program] c_prog with nogil: + c_prog = move(regex_program.create(pattern_string, c_flags)) c_result = move(cpp_rsplit_re( source_view, dereference(c_prog), @@ -220,10 +220,10 @@ def split_record_re(Column source_strings, cdef column_view source_view = source_strings.view() cdef string pattern_string = str(pattern).encode() cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog = \ - regex_program.create(pattern_string, c_flags) + cdef unique_ptr[regex_program] c_prog with nogil: + c_prog = move(regex_program.create(pattern_string, c_flags)) c_result = move(cpp_split_record_re( source_view, dereference(c_prog), @@ -248,10 +248,10 @@ def rsplit_record_re(Column source_strings, cdef column_view source_view = source_strings.view() cdef string pattern_string = str(pattern).encode() cdef regex_flags c_flags = regex_flags.DEFAULT 
- cdef unique_ptr[regex_program] c_prog = \ - regex_program.create(pattern_string, c_flags) + cdef unique_ptr[regex_program] c_prog with nogil: + c_prog = move(regex_program.create(pattern_string, c_flags)) c_result = move(cpp_rsplit_record_re( source_view, dereference(c_prog), diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 10208611f13..c866e064366 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -784,6 +784,12 @@ def test_string_extract(ps_gs, pat, expand, flags, flags_raise): assert_eq(expect, got) +def test_string_invalid_regex(): + gs = cudf.Series(["a"]) + with pytest.raises(RuntimeError): + gs.str.extract(r"{\}") + + @pytest.mark.parametrize( "pat,regex", [