BUG: IntervalIndex.get_indexer incorrectly matching ints to datetimes (pandas-dev#54964)

* REF: separate out _nbins_to_bins

* Cast x to Index early

* BUG: IntervalIndex.get_indexer incorrectly matching ints to datetimes

* GH ref
jbrockmendel authored Sep 20, 2023
1 parent a98be06 commit 28f274c
Showing 5 changed files with 92 additions and 113 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
@@ -288,6 +288,8 @@ Strings
Interval
^^^^^^^^
- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`)
- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`)
- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`)
-

Indexing
@@ -349,6 +351,7 @@ Styler

Other
^^^^^
- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)

.. ***DO NOT USE THIS SECTION***
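Not part of the diff: a minimal sketch of the user-facing behavior behind the new `cut` whatsnew entry, assuming a pandas build that includes this change.

```python
import pandas as pd

# Timezone-aware values with timezone-naive bins now raise instead of
# silently comparing mismatched datetimes (GH#54964).
values = pd.Series(pd.date_range("2013-01-01", periods=3, tz="US/Eastern"))
naive_bins = pd.to_datetime(["2013-01-01", "2013-01-02", "2013-01-03"])

try:
    pd.cut(values, naive_bins)
except ValueError as err:
    print(err)  # Cannot use timezone-naive bins with timezone-aware values, ...
```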
6 changes: 1 addition & 5 deletions pandas/core/indexes/base.py
@@ -3938,12 +3938,8 @@ def _should_partial_index(self, target: Index) -> bool:
if isinstance(self.dtype, IntervalDtype):
if isinstance(target.dtype, IntervalDtype):
return False
# See https://github.com/pandas-dev/pandas/issues/47772 the commented
# out code can be restored (instead of hardcoding `return True`)
# once that issue is fixed
# "Index" has no attribute "left"
# return self.left._should_compare(target) # type: ignore[attr-defined]
return True
return self.left._should_compare(target) # type: ignore[attr-defined]
return False

@final
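For illustration only (not from the diff): restoring the `_should_compare` check on the left endpoints means integer targets no longer partially match datetime intervals. A rough sketch, assuming a pandas build with this fix:

```python
import pandas as pd

ii = pd.IntervalIndex.from_breaks(pd.date_range("2018-01-01", periods=4))
# The left endpoints (a DatetimeIndex) decide whether the target is comparable.
print(ii.left.dtype)                        # datetime64[ns]
# Integer targets are not comparable, so every position is a miss (-1).
print(ii.get_indexer(pd.Index([0, 1, 2])))  # expected: [-1 -1 -1]
```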
168 changes: 65 additions & 103 deletions pandas/core/reshape/tile.py
@@ -17,10 +17,8 @@
Timestamp,
lib,
)
from pandas._libs.lib import infer_dtype

from pandas.core.dtypes.common import (
DT64NS_DTYPE,
ensure_platform_int,
is_bool_dtype,
is_integer,
@@ -243,7 +241,7 @@ def cut(

original = x
x_idx = _preprocess_for_cut(x)
x_idx, dtype = _coerce_to_type(x_idx)
x_idx, _ = _coerce_to_type(x_idx)

if not np.iterable(bins):
bins = _nbins_to_bins(x_idx, bins, right)
@@ -254,16 +252,8 @@

else:
bins = Index(bins)
if isinstance(getattr(bins, "dtype", None), DatetimeTZDtype):
bins = np.asarray(bins, dtype=DT64NS_DTYPE)
else:
bins = np.asarray(bins)
bins = _convert_bin_to_numeric_type(bins, dtype)

# GH 26045: cast to float64 to avoid an overflow
if (np.diff(bins.astype("float64")) < 0).any():
if not bins.is_monotonic_increasing:
raise ValueError("bins must increase monotonically.")
bins = Index(bins)

fac, bins = _bins_to_cuts(
x_idx,
@@ -272,12 +262,11 @@
labels=labels,
precision=precision,
include_lowest=include_lowest,
dtype=dtype,
duplicates=duplicates,
ordered=ordered,
)

return _postprocess_for_cut(fac, bins, retbins, dtype, original)
return _postprocess_for_cut(fac, bins, retbins, original)


def qcut(
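A small illustration (not in the diff) of the simplified monotonicity check in the `cut` hunk above: explicit bins are wrapped in an `Index` and validated with `is_monotonic_increasing` rather than by diffing a float64 cast.

```python
import pandas as pd

print(pd.Index([0, 1, 3, 2]).is_monotonic_increasing)  # False

try:
    pd.cut([0.5, 1.5], bins=[0, 2, 1])
except ValueError as err:
    print(err)  # bins must increase monotonically.
```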
@@ -343,25 +332,22 @@
"""
original = x
x_idx = _preprocess_for_cut(x)
x_idx, dtype = _coerce_to_type(x_idx)
x_idx, _ = _coerce_to_type(x_idx)

quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q

x_np = np.asarray(x_idx)
x_np = x_np[~np.isnan(x_np)]
bins = np.quantile(x_np, quantiles)
bins = x_idx.to_series().dropna().quantile(quantiles)

fac, bins = _bins_to_cuts(
x_idx,
Index(bins),
labels=labels,
precision=precision,
include_lowest=True,
dtype=dtype,
duplicates=duplicates,
)

return _postprocess_for_cut(fac, bins, retbins, dtype, original)
return _postprocess_for_cut(fac, bins, retbins, original)


def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
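Not from the diff: a sketch of the idea behind replacing the `np.quantile` path in `qcut`. Computing quantiles on the Series/Index directly keeps datetime-like dtypes and missing-value handling intact; the exact internal behavior is an assumption here.

```python
import pandas as pd

ser = pd.Series(pd.date_range("2023-01-01", periods=5))
# Series.quantile understands datetime64 data (and NaT) natively:
print(ser.dropna().quantile([0.0, 0.5, 1.0]))
```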
@@ -378,18 +364,41 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
rng = (x_idx.min(), x_idx.max())
mn, mx = rng

if np.isinf(mn) or np.isinf(mx):
is_dt_or_td = lib.is_np_dtype(x_idx.dtype, "mM") or isinstance(
x_idx.dtype, DatetimeTZDtype
)

if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)):
# GH#24314
raise ValueError(
"cannot specify integer `bins` when input data contains infinity"
)

if mn == mx: # adjust end points before binning
mn -= 0.001 * abs(mn) if mn != 0 else 0.001
mx += 0.001 * abs(mx) if mx != 0 else 0.001
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
if is_dt_or_td:
# using seconds=1 is pretty arbitrary here
td = Timedelta(seconds=1)
# Use DatetimeArray/TimedeltaArray method instead of linspace
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
# has no attribute "_generate_range"
bins = x_idx._values._generate_range( # type: ignore[union-attr]
start=mn - td, end=mx + td, periods=nbins + 1, freq=None
)
else:
mn -= 0.001 * abs(mn) if mn != 0 else 0.001
mx += 0.001 * abs(mx) if mx != 0 else 0.001

bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
else: # adjust end points after binning
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
if is_dt_or_td:
# Use DatetimeArray/TimedeltaArray method instead of linspace
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
# has no attribute "_generate_range"
bins = x_idx._values._generate_range( # type: ignore[union-attr]
start=mn, end=mx, periods=nbins + 1, freq=None
)
else:
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
adj = (mx - mn) * 0.001 # 0.1% of the range
if right:
bins[0] -= adj
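An illustrative example (not part of the diff): with an integer number of bins, datetime and timedelta data now gets its equal-width edges from the array's own `_generate_range` rather than `np.linspace` over int64, so timezone information survives. A sketch assuming a pandas build with this change:

```python
import pandas as pd

ser = pd.Series(pd.date_range("2013-01-01", periods=6, tz="US/Eastern"))
result = pd.cut(ser, bins=3)
# The bin edges stay timezone-aware datetimes end to end.
print(result.cat.categories.dtype)  # interval dtype with a tz-aware subtype
```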
@@ -400,13 +409,12 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:


def _bins_to_cuts(
x: Index,
x_idx: Index,
bins: Index,
right: bool = True,
labels=None,
precision: int = 3,
include_lowest: bool = False,
dtype: DtypeObj | None = None,
duplicates: str = "raise",
ordered: bool = True,
):
@@ -422,7 +430,7 @@

if isinstance(bins, IntervalIndex):
# we have a fast-path here
ids = bins.get_indexer(x)
ids = bins.get_indexer(x_idx)
cat_dtype = CategoricalDtype(bins, ordered=True)
result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False)
return result, bins
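For context (not in the diff): passing an `IntervalIndex` as `bins` is what exercises the `get_indexer` fast path shown above. A small sketch:

```python
import pandas as pd

bins = pd.IntervalIndex.from_breaks([0, 1, 2, 3])
# cut() maps each value onto an existing interval via bins.get_indexer();
# values outside every interval (9.9 here) come back as NaN.
print(pd.cut([0.5, 1.5, 2.5, 9.9], bins=bins))
```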
@@ -437,12 +445,29 @@
bins = unique_bins

side: Literal["left", "right"] = "left" if right else "right"
ids = ensure_platform_int(bins.searchsorted(x, side=side))

try:
ids = bins.searchsorted(x_idx, side=side)
except TypeError as err:
# e.g. test_datetime_nan_error if bins are DatetimeArray and x_idx
# is integers
if x_idx.dtype.kind == "m":
raise ValueError("bins must be of timedelta64 dtype") from err
elif x_idx.dtype.kind == bins.dtype.kind == "M":
raise ValueError(
"Cannot use timezone-naive bins with timezone-aware values, "
"or vice-versa"
) from err
elif x_idx.dtype.kind == "M":
raise ValueError("bins must be of datetime64 dtype") from err
else:
raise
ids = ensure_platform_int(ids)

if include_lowest:
ids[np.asarray(x) == bins[0]] = 1
ids[x_idx == bins[0]] = 1

na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
na_mask = isna(x_idx) | (ids == len(bins)) | (ids == 0)
has_nas = na_mask.any()

if labels is not False:
@@ -454,7 +479,7 @@

if labels is None:
labels = _format_labels(
bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
bins, precision, right=right, include_lowest=include_lowest
)
elif ordered and len(set(labels)) != len(labels):
raise ValueError(
@@ -513,90 +538,28 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan)
x = Index(x_arr)

if dtype is not None:
# GH 19768: force NaT to NaN during integer conversion
x_arr = np.where(x.notna(), x.view(np.int64), np.nan)
x = Index(x_arr)

return x, dtype


def _convert_bin_to_numeric_type(bins, dtype: DtypeObj | None):
"""
if the passed bin is of datetime/timedelta type,
this method converts it to integer
Parameters
----------
bins : list-like of bins
dtype : dtype of data
Raises
------
ValueError if bins are not of a compat dtype to dtype
"""
bins_dtype = infer_dtype(bins, skipna=False)
if lib.is_np_dtype(dtype, "m"):
if bins_dtype in ["timedelta", "timedelta64"]:
bins = to_timedelta(bins).view(np.int64)
else:
raise ValueError("bins must be of timedelta64 dtype")
elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype):
if bins_dtype in ["datetime", "datetime64"]:
bins = to_datetime(bins)
if lib.is_np_dtype(bins.dtype, "M"):
# As of 2.0, to_datetime may give non-nano, so we need to convert
# here until the rest of this file recognizes non-nano
bins = bins.astype("datetime64[ns]", copy=False)
bins = bins.view(np.int64)
else:
raise ValueError("bins must be of datetime64 dtype")

return bins


def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None):
"""
Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
datelike
Parameters
----------
bins : list-like of bins
dtype : dtype of data
Returns
-------
bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
datelike
"""
if isinstance(dtype, DatetimeTZDtype):
bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz)
elif lib.is_np_dtype(dtype, "mM"):
bins = Index(bins.astype(np.int64), dtype=dtype)
return bins
return Index(x), dtype


def _format_labels(
bins: Index,
precision: int,
right: bool = True,
include_lowest: bool = False,
dtype: DtypeObj | None = None,
):
"""based on the dtype, return our labels"""
closed: IntervalLeftRight = "right" if right else "left"

formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]

if isinstance(dtype, DatetimeTZDtype):
formatter = lambda x: Timestamp(x, tz=dtype.tz)
if isinstance(bins.dtype, DatetimeTZDtype):
formatter = lambda x: x
adjust = lambda x: x - Timedelta("1ns")
elif lib.is_np_dtype(dtype, "M"):
formatter = Timestamp
elif lib.is_np_dtype(bins.dtype, "M"):
formatter = lambda x: x
adjust = lambda x: x - Timedelta("1ns")
elif lib.is_np_dtype(dtype, "m"):
formatter = Timedelta
elif lib.is_np_dtype(bins.dtype, "m"):
formatter = lambda x: x
adjust = lambda x: x - Timedelta("1ns")
else:
precision = _infer_precision(precision, bins)
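An illustration (not from the diff): because `bins` now stays datetime-like, `_format_labels` can pass values through unchanged and the resulting categories carry real `Timestamp` endpoints. A sketch assuming a pandas build with this change:

```python
import pandas as pd

ser = pd.Series(pd.date_range("2023-01-01", periods=4))
res = pd.cut(ser, bins=2)
print(res.cat.categories)       # IntervalIndex with Timestamp endpoints
print(res.cat.categories.left)  # DatetimeIndex, no int64 round-trip
```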
@@ -628,7 +591,7 @@ def _preprocess_for_cut(x) -> Index:
return Index(x)


def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, original):
def _postprocess_for_cut(fac, bins, retbins: bool, original):
"""
handles post processing for the cut method where
we combine the index information if the originally passed
Expand All @@ -640,7 +603,6 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi
if not retbins:
return fac

bins = _convert_bin_to_datelike_type(bins, dtype)
if isinstance(bins, Index) and is_numeric_dtype(bins.dtype):
bins = bins._values

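Not part of the diff: with `_convert_bin_to_datelike_type` removed, the bins handed back by `retbins=True` never leave their datetime/timedelta dtype, so no back-conversion is needed. A sketch, assuming a pandas build with this change:

```python
import pandas as pd

ser = pd.Series(pd.date_range("2023-01-01", periods=4))
_, returned_bins = pd.cut(ser, bins=2, retbins=True)
print(returned_bins)  # datetime-valued bin edges, not int64 nanoseconds
```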
2 changes: 1 addition & 1 deletion pandas/tests/indexes/interval/test_indexing.py
@@ -307,9 +307,9 @@ def test_get_indexer_datetime(self):
result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).astype(str))
tm.assert_numpy_array_equal(result, expected)

# TODO this should probably be deprecated?
# https://github.com/pandas-dev/pandas/issues/47772
result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).asi8)
expected = np.array([-1], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize(
26 changes: 22 additions & 4 deletions pandas/tests/reshape/test_cut.py
@@ -495,15 +495,33 @@ def test_datetime_cut(data):
tm.assert_series_equal(Series(result), expected)


@pytest.mark.parametrize(
"bins",
[
3,
@pytest.mark.parametrize("box", [list, np.array, Index, Series])
def test_datetime_tz_cut_mismatched_tzawareness(box):
# GH#54964
bins = box(
[
Timestamp("2013-01-01 04:57:07.200000"),
Timestamp("2013-01-01 21:00:00"),
Timestamp("2013-01-02 13:00:00"),
Timestamp("2013-01-03 05:00:00"),
]
)
ser = Series(date_range("20130101", periods=3, tz="US/Eastern"))

msg = "Cannot use timezone-naive bins with timezone-aware values"
with pytest.raises(ValueError, match=msg):
cut(ser, bins)


@pytest.mark.parametrize(
"bins",
[
3,
[
Timestamp("2013-01-01 04:57:07.200000", tz="UTC").tz_convert("US/Eastern"),
Timestamp("2013-01-01 21:00:00", tz="UTC").tz_convert("US/Eastern"),
Timestamp("2013-01-02 13:00:00", tz="UTC").tz_convert("US/Eastern"),
Timestamp("2013-01-03 05:00:00", tz="UTC").tz_convert("US/Eastern"),
],
],
)
