From 0aa8ed2113a9b198aa6c117b1dcf41bd189db4ac Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 5 Jul 2023 17:27:45 +0200 Subject: [PATCH 1/3] ENH: Use explicit methods instead of regex pattern in arrow strings --- pandas/core/arrays/string_arrow.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index fa56571d7b0b0..5e039827c9bfa 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -307,28 +307,25 @@ def _str_contains( return super()._str_contains(pat, case, flags, na, regex) if regex: - if case is False: - fallback_performancewarning() - return super()._str_contains(pat, case, flags, na, regex) - else: - result = pc.match_substring_regex(self._pa_array, pat) + result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: - if case: - result = pc.match_substring(self._pa_array, pat) - else: - result = pc.match_substring(pc.utf8_upper(self._pa_array), pat.upper()) + result = pc.match_substring(self._pa_array, pat, ignore_case=not case) result = BooleanDtype().__from_arrow__(result) if not isna(na): result[isna(result)] = bool(na) return result def _str_startswith(self, pat: str, na=None): - pat = f"^{re.escape(pat)}" - return self._str_contains(pat, na=na, regex=True) + result = pc.starts_with(self._pa_array, pattern=pat) + if not isna(na): + result = result.fill_null(na) + return type(self)(result) def _str_endswith(self, pat: str, na=None): - pat = f"{re.escape(pat)}$" - return self._str_contains(pat, na=na, regex=True) + result = pc.ends_with(self._pa_array, pattern=pat) + if not isna(na): + result = result.fill_null(na) + return type(self)(result) def _str_replace( self, From c73ae7472784d63ae20b15ea7647201ee7081088 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 5 Jul 2023 23:00:33 +0200 Subject: [PATCH 2/3] Fixup --- pandas/core/arrays/string_arrow.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 5e039827c9bfa..12f4b5486b6b9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -319,13 +319,19 @@ def _str_startswith(self, pat: str, na=None): result = pc.starts_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - return type(self)(result) + result = BooleanDtype().__from_arrow__(result) + if not isna(na): + result[isna(result)] = bool(na) + return result def _str_endswith(self, pat: str, na=None): result = pc.ends_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - return type(self)(result) + result = BooleanDtype().__from_arrow__(result) + if not isna(na): + result[isna(result)] = bool(na) + return result def _str_replace( self, From e97deb3bfcb17a653aef88d0fee7a68ae15417d0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 6 Jul 2023 22:24:47 +0200 Subject: [PATCH 3/3] Fix --- pandas/tests/strings/test_find_replace.py | 26 ++++++----------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 89718b1b35f12..c3cc8b3643ed2 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -53,10 +53,8 @@ def test_contains(any_string_dtype): np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object), dtype=any_string_dtype, ) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): - result = values.str.contains("FOO|mmm", case=False) + + result = values.str.contains("FOO|mmm", case=False) expected = Series(np.array([True, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -172,10 +170,7 @@ def test_contains_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): - result = s.str.contains("a", case=False) + result = s.str.contains("a", case=False) expected = Series( [True, False, False, True, True, False, np.nan, True, False, True], dtype=expected_dtype, @@ -196,10 +191,7 @@ def test_contains_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): - result = s.str.contains("ba", case=False) + result = s.str.contains("ba", case=False) expected = Series( [False, False, False, True, True, False, np.nan, True, False, False], dtype=expected_dtype, @@ -723,10 +715,7 @@ def test_match_na_kwarg(any_string_dtype): def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): - result = values.str.match("ab", case=False) + result = values.str.match("ab", case=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -769,10 +758,7 @@ def test_fullmatch_case_kwarg(any_string_dtype): expected = Series([True, True, False, False], dtype=expected_dtype) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): - result = ser.str.fullmatch("ab", case=False) + result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) with tm.maybe_produces_warning(