From 091baa8c975aa68f6e04d13229cb716bb994e723 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 28 Oct 2024 16:14:09 +0100 Subject: [PATCH 01/10] String dtype: use ObjectEngine for indexing for now correctness over performance --- pandas/core/indexes/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4a90b164c89cc..af2f83927f197 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -875,8 +875,7 @@ def _engine( # error: Item "ExtensionArray" of "Union[ExtensionArray, # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] - elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): - return libindex.StringEngine(target_values) + # TODO re-enable StringEngine for string dtype # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" From cfb73f5ab1c4d9c9e78751ff28a2ae1997ae473a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Nov 2024 17:36:14 +0100 Subject: [PATCH 02/10] add string-specific ObjectEngine subclass for pre-processing of input values --- pandas/_libs/index.pyi | 1 + pandas/_libs/index.pyx | 10 ++++++++++ pandas/core/indexes/base.py | 2 ++ 3 files changed, 13 insertions(+) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index bf6d8ba8973d3..99b45f474e4f3 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -54,6 +54,7 @@ class UInt16Engine(IndexEngine): ... class UInt8Engine(IndexEngine): ... class ObjectEngine(IndexEngine): ... class StringEngine(IndexEngine): ... +class StringObjectEngine(ObjectEngine): ... class DatetimeEngine(Int64Engine): ... class TimedeltaEngine(DatetimeEngine): ... class PeriodEngine(Int64Engine): ... diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 1506a76aa94a6..94698aadac771 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -557,6 +557,16 @@ cdef class StringEngine(IndexEngine): raise KeyError(val) return str(val) +cdef class StringObjectEngine(ObjectEngine): + + cdef _check_type(self, object val): + if isinstance(val, str): + return val + elif checknull(val): + return np.nan + else: + raise KeyError(val) + cdef class DatetimeEngine(Int64Engine): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index af2f83927f197..26f2e27924493 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -876,6 +876,8 @@ def _engine( # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] # TODO re-enable StringEngine for string dtype + elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): + return libindex.StringObjectEngine(target_values) # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" From 6892f8364f5595ec097b0fb9f039ee058cf7ac7e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Nov 2024 19:55:37 +0100 Subject: [PATCH 03/10] remove xfails --- pandas/tests/frame/indexing/test_indexing.py | 3 --- pandas/tests/reshape/test_pivot.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 84c01e0be3b6f..a9bc485283985 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import iNaT from pandas.errors import InvalidIndexError @@ -503,7 +501,6 @@ def test_setitem_ambig(self, using_infer_string): else: assert dm[2].dtype == np.object_ - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_None(self, float_frame): # GH #766 float_frame[None] = float_frame["A"] diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d8a9acdc561fd..3f48d6af7df2a 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2698,9 +2698,6 @@ def test_pivot_index_is_none(self): expected = DataFrame(3, index=[1], columns=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" - ) def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) From e007299cebe221d396fef2046bdc510450457e6d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Nov 2024 20:14:26 +0100 Subject: [PATCH 04/10] add tests for get_loc + fix for NA variant of string dtype --- pandas/_libs/index.pyx | 9 +++++- pandas/core/indexes/base.py | 3 +- pandas/tests/indexes/string/test_indexing.py | 31 ++++++++++++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 94698aadac771..c219d0b63870f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -559,11 +559,18 @@ cdef class StringEngine(IndexEngine): cdef class StringObjectEngine(ObjectEngine): + cdef: + object na_value + + def __init__(self, ndarray values, na_value): + super().__init__(values) + self.na_value = na_value + cdef _check_type(self, object val): if isinstance(val, str): return val elif checknull(val): - return np.nan + return self.na_value else: raise KeyError(val) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 26f2e27924493..71abcd2c6e13f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -875,9 +875,8 @@ def _engine( # error: Item "ExtensionArray" of "Union[ExtensionArray, # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] - # TODO re-enable StringEngine for string dtype elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): - return libindex.StringObjectEngine(target_values) + return libindex.StringObjectEngine(target_values, self.dtype.na_value) # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index 755b7109a5a04..4b2de683c353e 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -6,6 +6,37 @@ import pandas._testing as tm +class TestGetLoc: + def test_get_loc(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + assert index.get_loc("b") == 1 + + def test_get_loc_raises(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="d"): + index.get_loc("d") + + def test_get_loc_invalid_value(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="1"): + index.get_loc(1) + + def test_get_loc_non_unique(self, any_string_dtype): + index = Index(["a", "b", "a"], dtype=any_string_dtype) + result = index.get_loc("a") + expected = np.array([True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + + def test_get_loc_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) + assert index.get_loc(nulls_fixture) == 2 + + class TestGetIndexer: @pytest.mark.parametrize( "method,expected", From bb148ba8a40dd9d4344ad9464ed5396473824aff Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Nov 2024 20:51:04 +0100 Subject: [PATCH 05/10] support get_indexer --- pandas/core/indexes/base.py | 4 +++ pandas/tests/indexes/string/test_indexing.py | 32 ++++++++++++++++---- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 71abcd2c6e13f..837a678736cd0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6222,6 +6222,10 @@ def _maybe_downcast_for_indexing(self, other: Index) -> tuple[Index, Index]: # let's instead try with a straight Index self = Index(self._values) + elif self.dtype == "string" and other.dtype == "object": + if lib.is_string_array(other._values, skipna=True): + return self, other.astype(self.dtype) + if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype): # Reverse op so we dont need to re-implement on the subclasses other, self = other._maybe_downcast_for_indexing(self) diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index 4b2de683c353e..dc8e1c0ba9bee 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -72,12 +72,32 @@ def test_get_indexer_strings_raises(self, any_string_dtype): ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] ) + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_missing(self, any_string_dtype, null): + # NaT and Decimal("NaN") from null_fixture are not supported for string dtype + index = Index(["a", "b", null], dtype=any_string_dtype) + result = index.get_indexer(["a", null, "c"]) + expected = np.array([0, 2, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestGetIndexerNonUnique: - @pytest.mark.xfail(reason="TODO(infer_string)", strict=False) - def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture): - index = Index(["a", "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_non_unique_nas(self, request, any_string_dtype, null): + if ( + any_string_dtype == "string" + and any_string_dtype.na_value is pd.NA + and isinstance(null, float) + ): + # TODO(infer_string) + request.applymarker( + pytest.mark.xfail( + reason="NA-variant string dtype does not work with NaN" + ) + ) + + index = Index(["a", "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique([null]) expected_indexer = np.array([2], dtype=np.intp) expected_missing = np.array([], dtype=np.intp) @@ -85,8 +105,8 @@ def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture): tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", None, "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + index = Index(["a", null, "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique([null]) expected_indexer = np.array([1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) From a669d75f661783f0e1029efca621319ce2e2f38e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 16 Nov 2024 15:26:14 +0100 Subject: [PATCH 06/10] update tests --- pandas/tests/frame/test_arithmetic.py | 23 +++++++++++++++-------- pandas/tests/reshape/test_pivot.py | 9 +++++---- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 6b61fe8b05219..f9342cf33f6f9 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -2101,12 +2099,21 @@ def test_enum_column_equality(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_mixed_col_index_dtype(): +def test_mixed_col_index_dtype(any_string_dtype): # GH 47382 - df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) - df2 = DataFrame(columns=list("abc"), data=0.0, index=[0]) - df1.columns = df2.columns.astype("string") + df1 = DataFrame( + columns=Index(list("abc"), dtype=any_string_dtype), data=1.0, index=[0] + ) + df2 = DataFrame(columns=Index(list("abc"), dtype="object"), data=0.0, index=[0]) + result = df1 + df2 - expected = DataFrame(columns=list("abc"), data=1.0, index=[0]) + expected = DataFrame( + columns=Index(list("abc"), dtype=any_string_dtype), data=1.0, index=[0] + ) + tm.assert_frame_equal(result, expected) + + result = df2 + df1 + expected = DataFrame( + columns=Index(list("abc"), dtype="object"), data=1.0, index=[0] + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 3f48d6af7df2a..a70f31ddc8c62 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2664,6 +2664,8 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() + # this still fails because columns=None gets passed down to unstack as level=None + # while at that point None was converted to NaN @pytest.mark.xfail( using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" ) @@ -2682,10 +2684,7 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" - ) - def test_pivot_index_is_none(self): + def test_pivot_index_is_none(self, using_infer_string): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2696,6 +2695,8 @@ def test_pivot_index_is_none(self): result = df.pivot(columns="b", index=None, values="c") expected = DataFrame(3, index=[1], columns=Index([2], name="b")) + if using_infer_string: + expected.index.name = np.nan tm.assert_frame_equal(result, expected) def test_pivot_values_is_none(self): From fccd220c32977937932042cdd57986fd73775631 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 17 Nov 2024 09:58:31 +0100 Subject: [PATCH 07/10] update xfail for parser test --- pandas/tests/io/parser/common/test_common_basic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 511db2c6a33d8..3680273f5e98a 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -15,6 +15,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.errors import ( EmptyDataError, ParserError, @@ -766,7 +767,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 From 81423007ea7ba53518c2e8007a86766c3b66a8f5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 17 Nov 2024 10:02:17 +0100 Subject: [PATCH 08/10] try fix typing --- pandas/_libs/index.pyi | 4 +++- pandas/core/indexes/base.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 99b45f474e4f3..3af2856d2fbbf 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -54,7 +54,6 @@ class UInt16Engine(IndexEngine): ... class UInt8Engine(IndexEngine): ... class ObjectEngine(IndexEngine): ... class StringEngine(IndexEngine): ... -class StringObjectEngine(ObjectEngine): ... class DatetimeEngine(Int64Engine): ... class TimedeltaEngine(DatetimeEngine): ... class PeriodEngine(Int64Engine): ... @@ -73,6 +72,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ... class MaskedUInt8Engine(MaskedIndexEngine): ... class MaskedBoolEngine(MaskedUInt8Engine): ... +class StringObjectEngine(ObjectEngine): + def __init__(self, values: object, na_value) -> None: ... + class BaseMultiIndexCodesEngine: levels: list[np.ndarray] offsets: np.ndarray # np.ndarray[..., ndim=1] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2eaa2c7944fd2..6c363864b5d03 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -876,7 +876,7 @@ def _engine( # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): - return libindex.StringObjectEngine(target_values, self.dtype.na_value) + return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr] # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" @@ -6223,7 +6223,7 @@ def _maybe_downcast_for_indexing(self, other: Index) -> tuple[Index, Index]: self = Index(self._values) elif self.dtype == "string" and other.dtype == "object": - if lib.is_string_array(other._values, skipna=True): + if lib.is_string_array(other._values, skipna=True): # type: ignore[arg-type] return self, other.astype(self.dtype) if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype): From 43a3edfbc2bd871b5879acfbf882dd28678cafc1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 23 Nov 2024 16:46:55 +0100 Subject: [PATCH 09/10] limit get_loc to exact match for now --- pandas/_libs/index.pyx | 10 +++++- pandas/core/indexes/base.py | 5 --- pandas/tests/frame/indexing/test_indexing.py | 3 ++ pandas/tests/frame/test_arithmetic.py | 23 +++++------- pandas/tests/indexes/string/test_indexing.py | 38 ++++++++++---------- pandas/tests/reshape/test_pivot.py | 12 ++++--- 6 files changed, 47 insertions(+), 44 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index c219d0b63870f..688f943760d1f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -561,15 +561,23 @@ cdef class StringObjectEngine(ObjectEngine): cdef: object na_value + bint uses_na def __init__(self, ndarray values, na_value): super().__init__(values) self.na_value = na_value + self.uses_na = na_value is C_NA + + cdef bint _checknull(self, object val): + if self.uses_na: + return val is C_NA + else: + return util.is_nan(val) cdef _check_type(self, object val): if isinstance(val, str): return val - elif checknull(val): + elif self._checknull(val): return self.na_value else: raise KeyError(val) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6c363864b5d03..165fe109c4c94 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5974,7 +5974,6 @@ def _should_fallback_to_positional(self) -> bool: def get_indexer_non_unique( self, target ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - target = ensure_index(target) target = self._maybe_cast_listlike_indexer(target) if not self._should_compare(target) and not self._should_partial_index(target): @@ -6222,10 +6221,6 @@ def _maybe_downcast_for_indexing(self, other: Index) -> tuple[Index, Index]: # let's instead try with a straight Index self = Index(self._values) - elif self.dtype == "string" and other.dtype == "object": - if lib.is_string_array(other._values, skipna=True): # type: ignore[arg-type] - return self, other.astype(self.dtype) - if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype): # Reverse op so we dont need to re-implement on the subclasses other, self = other._maybe_downcast_for_indexing(self) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index a9bc485283985..84c01e0be3b6f 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import iNaT from pandas.errors import InvalidIndexError @@ -501,6 +503,7 @@ def test_setitem_ambig(self, using_infer_string): else: assert dm[2].dtype == np.object_ + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_None(self, float_frame): # GH #766 float_frame[None] = float_frame["A"] diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index f9342cf33f6f9..6b61fe8b05219 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -2099,21 +2101,12 @@ def test_enum_column_equality(): tm.assert_series_equal(result, expected) -def test_mixed_col_index_dtype(any_string_dtype): +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +def test_mixed_col_index_dtype(): # GH 47382 - df1 = DataFrame( - columns=Index(list("abc"), dtype=any_string_dtype), data=1.0, index=[0] - ) - df2 = DataFrame(columns=Index(list("abc"), dtype="object"), data=0.0, index=[0]) - + df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) + df2 = DataFrame(columns=list("abc"), data=0.0, index=[0]) + df1.columns = df2.columns.astype("string") result = df1 + df2 - expected = DataFrame( - columns=Index(list("abc"), dtype=any_string_dtype), data=1.0, index=[0] - ) - tm.assert_frame_equal(result, expected) - - result = df2 + df1 - expected = DataFrame( - columns=Index(list("abc"), dtype="object"), data=1.0, index=[0] - ) + expected = DataFrame(columns=list("abc"), data=1.0, index=[0]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index dc8e1c0ba9bee..3afcec9d73fdd 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -6,6 +6,13 @@ import pandas._testing as tm +def _isnan(val): + try: + return val is not pd.NA and np.isnan(val) + except TypeError: + return False + + class TestGetLoc: def test_get_loc(self, any_string_dtype): index = Index(["a", "b", "c"], dtype=any_string_dtype) @@ -34,7 +41,14 @@ def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): def test_get_loc_missing(self, any_string_dtype, nulls_fixture): index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) - assert index.get_loc(nulls_fixture) == 2 + if any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture)) + ): + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + else: + assert index.get_loc(nulls_fixture) == 2 class TestGetIndexer: @@ -83,32 +97,20 @@ def test_get_indexer_missing(self, any_string_dtype, null): class TestGetIndexerNonUnique: @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) - def test_get_indexer_non_unique_nas(self, request, any_string_dtype, null): - if ( - any_string_dtype == "string" - and any_string_dtype.na_value is pd.NA - and isinstance(null, float) - ): - # TODO(infer_string) - request.applymarker( - pytest.mark.xfail( - reason="NA-variant string dtype does not work with NaN" - ) - ) - + def test_get_indexer_non_unique_nas(self, any_string_dtype, null): index = Index(["a", "b", null], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([null]) + indexer, missing = index.get_indexer_non_unique(["a", null]) - expected_indexer = np.array([2], dtype=np.intp) + expected_indexer = np.array([0, 2], dtype=np.intp) expected_missing = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique index = Index(["a", null, "b", null], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([null]) + indexer, missing = index.get_indexer_non_unique(["a", null]) - expected_indexer = np.array([1, 3], dtype=np.intp) + expected_indexer = np.array([0, 1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 374d236c8ff39..f42f7f8232229 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2668,8 +2668,6 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() - # this still fails because columns=None gets passed down to unstack as level=None - # while at that point None was converted to NaN @pytest.mark.xfail( using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" ) @@ -2688,7 +2686,10 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - def test_pivot_index_is_none(self, using_infer_string): + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) + def test_pivot_index_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2699,10 +2700,11 @@ def test_pivot_index_is_none(self, using_infer_string): result = df.pivot(columns="b", index=None, values="c") expected = DataFrame(3, index=[1], columns=Index([2], name="b")) - if using_infer_string: - expected.index.name = np.nan tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) From c546a518aedbac0a96a4f2aea7ea8d18cb8a4d81 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 23 Nov 2024 17:20:33 +0100 Subject: [PATCH 10/10] fix for non-infer_string mode --- pandas/tests/indexes/string/test_indexing.py | 41 +++++++++++++++++--- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index 3afcec9d73fdd..d1a278af337b7 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -87,22 +87,43 @@ def test_get_indexer_strings_raises(self, any_string_dtype): ) @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) - def test_get_indexer_missing(self, any_string_dtype, null): + def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): # NaT and Decimal("NaN") from null_fixture are not supported for string dtype index = Index(["a", "b", null], dtype=any_string_dtype) result = index.get_indexer(["a", null, "c"]) - expected = np.array([0, 2, -1], dtype=np.intp) + if using_infer_string: + expected = np.array([0, 2, -1], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected = np.array([0, -1, -1], dtype=np.intp) + else: + expected = np.array([0, 2, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) class TestGetIndexerNonUnique: @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) - def test_get_indexer_non_unique_nas(self, any_string_dtype, null): + def test_get_indexer_non_unique_nas( + self, any_string_dtype, null, using_infer_string + ): index = Index(["a", "b", null], dtype=any_string_dtype) indexer, missing = index.get_indexer_non_unique(["a", null]) - expected_indexer = np.array([0, 2], dtype=np.intp) - expected_missing = np.array([], dtype=np.intp) + if using_infer_string: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected_indexer = np.array([0, -1], dtype=np.intp) + expected_missing = np.array([1], dtype=np.intp) + else: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) @@ -110,7 +131,15 @@ def test_get_indexer_non_unique_nas(self, any_string_dtype, null): index = Index(["a", null, "b", null], dtype=any_string_dtype) indexer, missing = index.get_indexer_non_unique(["a", null]) - expected_indexer = np.array([0, 1, 3], dtype=np.intp) + if using_infer_string: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + pass + else: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing)