Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: ExtensionArray.argsort places the missing value at the end #27137

Merged
merged 12 commits into from
Jul 3, 2019
12 changes: 12 additions & 0 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,4 +141,16 @@ def time_quantile(self, quantile, interpolation, dtype):
self.idx.quantile(quantile, interpolation=interpolation)


class EASorting:
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
params = [10**3, 10**5]

def setup(self, N):
data = np.arange(N, dtype=float)
data[40] = np.nan
self.array = pd.array(data, dtype='Int64')

def time_argsort(self, N):
self.array.argsort()


from .pandas_vb_common import setup # noqa: F401 isort:skip
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,7 @@ Other API changes
- Removed support of gtk package for clipboards (:issue:`26563`)
- Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`)
- :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` will now raise a ``ValueError`` when saving timezone aware data. (:issue:`27008`, :issue:`7056`)
- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array, except :class:`Categorical` (:issue:`21801`)
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved

.. _whatsnew_0250.deprecations:

Expand Down
11 changes: 6 additions & 5 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

from pandas._typing import ArrayLike
from pandas.core import ops
from pandas.core.sorting import nargsort

_not_implemented_message = "{} does not implement {}."

Expand Down Expand Up @@ -398,7 +399,8 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
Returns
-------
index_array : ndarray
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so we aren't specific that these are actually intp (and NOT int64's)

Array of indices that sort ``self``.
Array of indices that sort ``self``. If NaN values are contained,
NaN values are placed at the end.

See Also
--------
Expand All @@ -409,10 +411,9 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
# 1. _values_for_argsort : construct the values passed to np.argsort
# 2. argsort : total control over sorting.
ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
values = self._values_for_argsort()
result = np.argsort(values, kind=kind, **kwargs)
if not ascending:
result = result[::-1]

result = nargsort(self, kind=kind, ascending=ascending,
na_position='last')
return result

def fillna(self, value=None, method=None, limit=None):
Expand Down
10 changes: 7 additions & 3 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1509,7 +1509,7 @@ def check_for_ordered(self, op):
def _values_for_argsort(self):
return self._codes.copy()

def argsort(self, *args, **kwargs):
def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
# TODO(PY2): use correct signature
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
# We have to do *args, **kwargs to avoid a a py2-only signature
# issue since np.argsort differs from argsort.
Expand Down Expand Up @@ -1553,8 +1553,12 @@ def argsort(self, *args, **kwargs):
>>> cat.argsort()
array([3, 0, 1, 2])
"""
# Keep the implementation here just for the docstring.
return super().argsort(*args, **kwargs)
ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
values = self._values_for_argsort()
result = np.argsort(values, kind=kind, **kwargs)
if not ascending:
result = result[::-1]
return result

def sort_values(self, inplace=False, ascending=True, na_position='last'):
"""
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ def test_argsort(self, data_for_sorting):
expected = pd.Series(np.array([2, 0, 1], dtype=np.int64))
self.assert_series_equal(result, expected)

def test_argsort_nan_last(self, data_missing_for_sorting):
# GH 21801
result = data_missing_for_sorting.argsort()
assert result[-1] == 1

def test_argsort_missing(self, data_missing_for_sorting):
result = pd.Series(data_missing_for_sorting).argsort()
expected = pd.Series(np.array([1, -1, 0], dtype=np.int64))
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/extension/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,11 @@ def test_searchsorted(self, data_for_sorting):
if not data_for_sorting.ordered:
raise pytest.skip(reason="searchsorted requires ordered data.")

def test_argsort_nan_last(self, data_missing_for_sorting):
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
# GH 21801
# TODO: Categorical.argsort places NA values at the end
pass


class TestCasting(base.BaseCastingTests):
pass
Expand Down