Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: PandasArray._quantile when empty #46110

Merged
merged 4 commits into from
Feb 26, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions pandas/core/array_algos/quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import numpy as np

from pandas._libs import lib
from pandas._typing import (
ArrayLike,
Scalar,
Expand Down Expand Up @@ -128,7 +127,9 @@ def _nanpercentile_1d(
values = values[~mask]

if len(values) == 0:
return np.array([na_value] * len(qs), dtype=values.dtype)
# Can't pass dtype=values.dtype here bc we might have na_value=np.nan
# with values.dtype=int64 see test_quantile_empty
return np.array([na_value] * len(qs))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if this would kill anything else here but na_value * np.empty((len(qs), )) should be significantly faster.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that would break things when na_value is e.g. -1

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. np.zeros should work?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe you mean np.full? regardless id prefer not to bikeshed here

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What we could also use is

np.tile(np.array([na_value]), (len(qs, ))

This should also be faster than the list multiplication

%timeit np.array([na_value] * 1_000_000)
53.6 ms ± 1.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit na_value * np.zeros((1_000_000, ))
1.3 ms ± 28.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%timeit np.tile(np.array([na_value]), (1_000_000, ))
2.75 ms ± 84.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe you mean np.full? regardless id prefer not to bikeshed here

Ok with me, just wanted to mention it

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated to use np.full, green


return np.percentile(values, qs, **{np_percentile_argname: interpolation})

Expand Down Expand Up @@ -173,7 +174,7 @@ def _nanpercentile(
# have float result at this point, not i8
return result.astype(values.dtype)

if not lib.is_scalar(mask) and mask.any():
if mask.any():
# Caller is responsible for ensuring mask shape match
assert mask.shape == values.shape
result = [
Expand Down
21 changes: 17 additions & 4 deletions pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):

_ndarray: np.ndarray

# scalar used to denote NA value inside our self._ndarray, e.g. -1
# for Categorical, iNaT for Period. Outside of object dtype,
# self.isna() should be exactly locations in self._ndarray with
# _internal_fill_value.
_internal_fill_value: Any

def _box_func(self, x):
"""
Wrap numpy type in our dtype.type if necessary.
Expand Down Expand Up @@ -462,18 +468,25 @@ def _quantile(
mask = np.atleast_2d(mask)

arr = np.atleast_2d(self._ndarray)
# TODO: something NDArrayBacked-specific instead of _values_for_factorize[1]?
fill_value = self._values_for_factorize()[1]
fill_value = self._internal_fill_value

res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)

result = type(self)._from_factorized(res_values, self)
res_values = self._cast_quantile_result(res_values)
result = self._from_backing_data(res_values)
if self.ndim == 1:
assert result.shape == (1, len(qs)), result.shape
result = result[0]

return result

# TODO: see if we can share this with other dispatch-wrapping methods
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
"""
Cast the result of quantile_with_mask to an appropriate dtype
to pass to _from_backing_data in _quantile.
"""
return res_values

# ------------------------------------------------------------------------
# numpy-like methods

Expand Down
6 changes: 6 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
# For comparisons, so that numpy uses our implementation if the compare
# ops, which raise
__array_priority__ = 1000
_internal_fill_value = -1
# tolist is not actually deprecated, just suppressed in the __dir__
_hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
_typ = "categorical"
Expand Down Expand Up @@ -2316,6 +2317,11 @@ def _from_factorized(cls, uniques, original):
original.categories.take(uniques), dtype=original.dtype
)

def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
# make sure we have correct itemsize for resulting codes
res_values = coerce_indexer_dtype(res_values, self.dtype.categories)
return res_values

def equals(self, other: object) -> bool:
"""
Returns True if categorical arrays are equal.
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):

_typ = "datetimearray"
_scalar_type = Timestamp
_internal_fill_value = np.datetime64("NaT", "ns")
_recognized_scalars = (datetime, np.datetime64)
_is_recognized_dtype = is_datetime64_any_dtype
_infer_matches = ("datetime", "datetime64", "date")
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class PandasArray(
__array_priority__ = 1000
_ndarray: np.ndarray
_dtype: PandasDtype
_internal_fill_value = np.nan

# ------------------------------------------------------------------------
# Constructors
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ class PeriodArray(dtl.DatelikeOps):
__array_priority__ = 1000
_typ = "periodarray" # ABCPeriodArray
_scalar_type = Period
_internal_fill_value = np.int64(iNaT)
_recognized_scalars = (Period,)
_is_recognized_dtype = is_period_dtype
_infer_matches = ("period",)
Expand Down Expand Up @@ -697,6 +698,12 @@ def fillna(self, value=None, method=None, limit=None) -> PeriodArray:
return result.view(self.dtype) # type: ignore[return-value]
return super().fillna(value=value, method=method, limit=limit)

# TODO: alternately could override _quantile like searchsorted
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
# quantile_with_mask may return float64 instead of int64, in which
# case we need to cast back
return res_values.astype(np.int64, copy=False)

# ------------------------------------------------------------------
# Arithmetic Methods

Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ class TimedeltaArray(dtl.TimelikeOps):

_typ = "timedeltaarray"
_scalar_type = Timedelta
_internal_fill_value = np.timedelta64("NaT", "ns")
_recognized_scalars = (timedelta, np.timedelta64, Tick)
_is_recognized_dtype = is_timedelta64_dtype
_infer_matches = ("timedelta", "timedelta64")
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/arrays/categorical/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,3 +363,13 @@ def test_validate_inplace_raises(self, value):

with pytest.raises(ValueError, match=msg):
cat.sort_values(inplace=value)

def test_quantile_empty(self):
# make sure we have correct itemsize on resulting codes
cat = Categorical(["A", "B"])
idx = Index([0.0, 0.5])
result = cat[:0]._quantile(idx, interpolation="linear")
assert result._codes.dtype == np.int8

expected = cat.take([-1, -1], allow_fill=True)
tm.assert_extension_array_equal(result, expected)
11 changes: 11 additions & 0 deletions pandas/tests/arrays/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,14 @@ def test_setitem_preserves_views():
arr[-1] = 2.5
view1[-1] = 5
assert arr[-1] == 5


@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
def test_quantile_empty(dtype):
# we should get back np.nans, not -1s
arr = PandasArray(np.array([], dtype=dtype))
idx = pd.Index([0.0, 0.5])

result = arr._quantile(idx, interpolation="linear")
expected = PandasArray(np.array([np.nan, np.nan]))
tm.assert_extension_array_equal(result, expected)