Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

String dtype: use ObjectEngine for indexing for now correctness over performance #60329

Merged
merged 13 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pandas/_libs/index.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ...
class MaskedUInt8Engine(MaskedIndexEngine): ...
class MaskedBoolEngine(MaskedUInt8Engine): ...

# Type stub for the Cython StringObjectEngine, an ObjectEngine subclass.
class StringObjectEngine(ObjectEngine):
# Constructed from the index values plus an explicit `na_value`.
def __init__(self, values: object, na_value) -> None: ...

class BaseMultiIndexCodesEngine:
levels: list[np.ndarray]
offsets: np.ndarray # np.ndarray[..., ndim=1]
Expand Down
17 changes: 17 additions & 0 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,23 @@ cdef class StringEngine(IndexEngine):
raise KeyError(val)
return str(val)

# Object-based index engine that additionally carries a missing-value
# sentinel (`na_value`) supplied by the caller.
cdef class StringObjectEngine(ObjectEngine):

cdef:
# Value returned by `_check_type` for any null-like key.
object na_value

def __init__(self, ndarray values, na_value):
# Let ObjectEngine initialise from the values, then remember the sentinel.
super().__init__(values)
self.na_value = na_value

# Validate/normalize a key: strings pass through unchanged, anything that
# `checknull` reports as missing is replaced by `self.na_value`, and every
# other value raises KeyError(val).
# NOTE(review): presumably invoked by the inherited lookup methods before
# the key is hashed -- confirm against the base engine implementation.
cdef _check_type(self, object val):
if isinstance(val, str):
return val
elif checknull(val):
return self.na_value
else:
raise KeyError(val)


cdef class DatetimeEngine(Int64Engine):

Expand Down
6 changes: 5 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -876,7 +876,7 @@ def _engine(
# ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr]
target_values = self._data._ndarray # type: ignore[union-attr]
elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype):
return libindex.StringEngine(target_values)
return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr]

# error: Argument 1 to "ExtensionEngine" has incompatible type
# "ndarray[Any, Any]"; expected "ExtensionArray"
Expand Down Expand Up @@ -6222,6 +6222,10 @@ def _maybe_downcast_for_indexing(self, other: Index) -> tuple[Index, Index]:
# let's instead try with a straight Index
self = Index(self._values)

elif self.dtype == "string" and other.dtype == "object":
if lib.is_string_array(other._values, skipna=True): # type: ignore[arg-type]
return self, other.astype(self.dtype)

if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype):
# Reverse op so we don't need to re-implement on the subclasses
other, self = other._maybe_downcast_for_indexing(self)
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs import iNaT
from pandas.errors import InvalidIndexError

Expand Down Expand Up @@ -503,7 +501,6 @@ def test_setitem_ambig(self, using_infer_string):
else:
assert dm[2].dtype == np.object_

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_setitem_None(self, float_frame):
# GH #766
float_frame[None] = float_frame["A"]
Expand Down
23 changes: 15 additions & 8 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -2101,12 +2099,21 @@ def test_enum_column_equality():
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_mixed_col_index_dtype():
def test_mixed_col_index_dtype(any_string_dtype):
# GH 47382
df1 = DataFrame(columns=list("abc"), data=1.0, index=[0])
df2 = DataFrame(columns=list("abc"), data=0.0, index=[0])
df1.columns = df2.columns.astype("string")
df1 = DataFrame(
columns=Index(list("abc"), dtype=any_string_dtype), data=1.0, index=[0]
)
df2 = DataFrame(columns=Index(list("abc"), dtype="object"), data=0.0, index=[0])

result = df1 + df2
expected = DataFrame(columns=list("abc"), data=1.0, index=[0])
expected = DataFrame(
columns=Index(list("abc"), dtype=any_string_dtype), data=1.0, index=[0]
)
tm.assert_frame_equal(result, expected)

result = df2 + df1
expected = DataFrame(
columns=Index(list("abc"), dtype="object"), data=1.0, index=[0]
)
tm.assert_frame_equal(result, expected)
63 changes: 57 additions & 6 deletions pandas/tests/indexes/string/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,37 @@
import pandas._testing as tm


# Tests for Index.get_loc, each run with the dtype supplied by the
# any_string_dtype fixture.
class TestGetLoc:
# A label that is present resolves to its integer position.
def test_get_loc(self, any_string_dtype):
index = Index(["a", "b", "c"], dtype=any_string_dtype)
assert index.get_loc("b") == 1

# A string label that is absent raises KeyError mentioning the label.
def test_get_loc_raises(self, any_string_dtype):
index = Index(["a", "b", "c"], dtype=any_string_dtype)
with pytest.raises(KeyError, match="d"):
index.get_loc("d")

# A non-string key (here an int) raises KeyError as well.
def test_get_loc_invalid_value(self, any_string_dtype):
index = Index(["a", "b", "c"], dtype=any_string_dtype)
with pytest.raises(KeyError, match="1"):
index.get_loc(1)

# With duplicated labels, get_loc returns a boolean mask over the index.
def test_get_loc_non_unique(self, any_string_dtype):
index = Index(["a", "b", "a"], dtype=any_string_dtype)
result = index.get_loc("a")
expected = np.array([True, False, True])
tm.assert_numpy_array_equal(result, expected)

# Looking up a null value in an index that holds no missing value
# raises KeyError.
def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture):
index = Index(["a", "b", "c"], dtype=any_string_dtype)
with pytest.raises(KeyError):
index.get_loc(nulls_fixture)

def test_get_loc_missing(self, any_string_dtype, nulls_fixture):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this test now means that you can use np.nan and pd.NA interchangeably when indexing? If that's correct, I'm not sure I agree that we should be going that far

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is that we are coercing any missing value indicator to NaN upon construction, and so to preserve back compat, I think I prefer we do the same for input to indexing operations.

To express it in terms of get_loc, this works now:

>>> pd.options.future.infer_string = False
>>> pd.Index(["a", "b", None]).get_loc(None)
2

but the same on main with enabling the string dtype:

>>> pd.options.future.infer_string = True
>>> pd.Index(["a", "b", None]).get_loc(None)
...
KeyError: None

That is because now the None is no longer in the object dtype index, but has been coerced to NaN.
(on main, trying the above with np.nan also fails (see the issue #59879), but that's because the StringEngine simply wasn't set up to work with missing values, so that is the initial reason I replaced it now with the StringObjectEngine)

The above is with None, but essentially happens with any other missing value indicator, like pd.NA. Maybe None and np.nan are the most important ones though, but I would at least prefer that indexing with None keeps working for now (we can always start deprecating it, but I wouldn't do that as a breaking change for 3.0)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW this is also already quite inconsistent depending on the data type .. See #59765 for an overview (e.g. also for datetimelike and categorical, we treat all NA-likes as the same in indexing lookups)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW this is also already quite inconsistent depending on the data type .. See #59765 for an overview (e.g. also for datetimelike and categorical, we treat all NA-likes as the same in indexing lookups)

Nice - that's a great issue. Thanks for opening it.

To express it in terms of get_loc, this works now:

Hmm I'm a bit confused by how this relates to all of the missing indicators becoming essentially equal though. On main, this does not work (?):

>>> pd.options.future.infer_string = False
>>> pd.Index(["a", "b", None]).get_loc(np.nan)
KeyError: nan

Definitely understand that there is not an ideal solution here given the inconsistent history, but I don't want to go too far and just start making all of the missing value indicators interchangeable. I think containment logic should land a little closer to equality logic, and in the latter we obviously don't allow this

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On main, this does not work (?):

Yes, that's the first bug that this PR is solving: right now no missing value lookup works, not even NaN itself (which is what is stored in the array). This is because the StringEngine simply doesn't handle missing values correctly (when building the hash table, it actually converts it to a sentinel string, but then for any of the lookup methods it doesn't take that into account; it's a bit of an incomplete implementation)

So by using the ObjectEngine (subclass), that fixes that first issue: ensuring NaN can be found

I think containment logic should land a little closer to equality logic, and in the latter we obviously don't allow this

Missing values don't compare equal (well, None does, but we specifically didn't choose that long term as the sentinel moving forward; np.nan and pd.NA don't compare equal), so containment is already a bit of a special case anyway compared to equality, when it comes to missing values.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair point on the equality. I guess I'm still hung up on the indexing behavior being the same though.

I've lost track of the nuance a bit, but haven't np.nan and pd.NA always had different indexing behavior? I'm just wary of glossing over that as part of this.

Maybe worth some input from @pandas-dev/pandas-core if anyone else has thoughts

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated the PR to, for now, just enable exact matching of missing values in get_loc, so this PR can already be merged (and fix the most glaring bug), and then we can have the discussion around backwards compatibility in #59879 (I don't think the above thread is very easy to follow for other people to chime in, will do a write up on the issue -> see #59879 (comment))

index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype)
assert index.get_loc(nulls_fixture) == 2


class TestGetIndexer:
@pytest.mark.parametrize(
"method,expected",
Expand Down Expand Up @@ -41,21 +72,41 @@ def test_get_indexer_strings_raises(self, any_string_dtype):
["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2]
)

@pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA])
def test_get_indexer_missing(self, any_string_dtype, null):
# NaT and Decimal("NaN") from nulls_fixture are not supported for string dtype
# The null target is expected to match the null stored at position 2,
# while the absent label "c" maps to -1.
index = Index(["a", "b", null], dtype=any_string_dtype)
result = index.get_indexer(["a", null, "c"])
expected = np.array([0, 2, -1], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)


class TestGetIndexerNonUnique:
@pytest.mark.xfail(reason="TODO(infer_string)", strict=False)
def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture):
index = Index(["a", "b", None], dtype=any_string_dtype)
indexer, missing = index.get_indexer_non_unique([nulls_fixture])
@pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA])
def test_get_indexer_non_unique_nas(self, request, any_string_dtype, null):
if (
any_string_dtype == "string"
and any_string_dtype.na_value is pd.NA
and isinstance(null, float)
):
# TODO(infer_string)
request.applymarker(
pytest.mark.xfail(
reason="NA-variant string dtype does not work with NaN"
)
)

index = Index(["a", "b", null], dtype=any_string_dtype)
indexer, missing = index.get_indexer_non_unique([null])

expected_indexer = np.array([2], dtype=np.intp)
expected_missing = np.array([], dtype=np.intp)
tm.assert_numpy_array_equal(indexer, expected_indexer)
tm.assert_numpy_array_equal(missing, expected_missing)

# actually non-unique
index = Index(["a", None, "b", None], dtype=any_string_dtype)
indexer, missing = index.get_indexer_non_unique([nulls_fixture])
index = Index(["a", null, "b", null], dtype=any_string_dtype)
indexer, missing = index.get_indexer_non_unique([null])

expected_indexer = np.array([1, 3], dtype=np.intp)
tm.assert_numpy_array_equal(indexer, expected_indexer)
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW
from pandas.errors import (
EmptyDataError,
ParserError,
Expand Down Expand Up @@ -766,7 +767,7 @@ def test_dict_keys_as_names(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0
def test_encoding_surrogatepass(all_parsers):
# GH39017
Expand Down
12 changes: 5 additions & 7 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2664,6 +2664,8 @@ def test_pivot_columns_not_given(self):
with pytest.raises(TypeError, match="missing 1 required keyword-only argument"):
df.pivot()

# this still fails because columns=None gets passed down to unstack as level=None
# while at that point None was converted to NaN
@pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
)
Expand All @@ -2682,10 +2684,7 @@ def test_pivot_columns_is_none(self):
expected = DataFrame({1: 3}, index=Index([2], name="b"))
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
)
def test_pivot_index_is_none(self):
def test_pivot_index_is_none(self, using_infer_string):
# GH#48293
df = DataFrame({None: [1], "b": 2, "c": 3})

Expand All @@ -2696,11 +2695,10 @@ def test_pivot_index_is_none(self):

result = df.pivot(columns="b", index=None, values="c")
expected = DataFrame(3, index=[1], columns=Index([2], name="b"))
if using_infer_string:
expected.index.name = np.nan
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
)
def test_pivot_values_is_none(self):
# GH#48293
df = DataFrame({None: [1], "b": 2, "c": 3})
Expand Down