diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 45ef47fde0a56..b69efb4689486 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -141,4 +141,16 @@ def time_quantile(self, quantile, interpolation, dtype): self.idx.quantile(quantile, interpolation=interpolation) +class SortIntegerArray: + params = [10**3, 10**5] + + def setup(self, N): + data = np.arange(N, dtype=float) + data[40] = np.nan + self.array = pd.array(data, dtype='Int64') + + def time_argsort(self, N): + self.array.argsort() + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ea6a04ac726b7..5c22a3bcee227 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -661,7 +661,7 @@ when both are :class:`Series` (:issue:`23293`). *Previous behavior* -.. code-block:: python +.. code-block:: ipython In [5]: np.power(s1, s2) Out[5]: @@ -684,6 +684,36 @@ applying the ufunc. np.power(s1, s2.array) +Categorical.argsort now places missing values at the end +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`Categorical.argsort` now places missing values at the end of the array, making it +consistent with NumPy and the rest of pandas (:issue:`21801`). + +.. ipython:: python + + cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True) + +*Previous behavior* + +.. code-block:: ipython + + In [2]: cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True) + + In [3]: cat.argsort() + Out[3]: array([1, 2, 0]) + + In [4]: cat[cat.argsort()] + Out[4]: + [NaN, a, b] + categories (2, object): [a < b] + +*New behavior* + +.. ipython:: python + + cat.argsort() + cat[cat.argsort()] .. 
_whatsnew_0250.api_breaking.deps: @@ -767,6 +797,7 @@ Other API changes - Removed support of gtk package for clipboards (:issue:`26563`) - Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`) - :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` will now raise a ``ValueError`` when saving timezone aware data. (:issue:`27008`, :issue:`7056`) +- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array. (:issue:`21801`) - :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` will now raise a ``NotImplementedError`` when saving a :class:`MultiIndex` with extention data types for a ``fixed`` format. (:issue:`7775`) - Passing duplicate ``names`` in :meth:`read_csv` will now raise a ``ValueError`` (:issue:`17346`) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0762a607f20ae..803a31928ab7a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -23,6 +23,7 @@ from pandas._typing import ArrayLike from pandas.core import ops +from pandas.core.sorting import nargsort _not_implemented_message = "{} does not implement {}." @@ -409,7 +410,8 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): Returns ------- index_array : ndarray - Array of indices that sort ``self``. + Array of indices that sort ``self``. If NaN values are contained, + NaN values are placed at the end. See Also -------- @@ -420,10 +422,9 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): # 1. _values_for_argsort : construct the values passed to np.argsort # 2. argsort : total control over sorting. 
ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self._values_for_argsort() - result = np.argsort(values, kind=kind, **kwargs) - if not ascending: - result = result[::-1] + + result = nargsort(self, kind=kind, ascending=ascending, + na_position='last') return result def fillna(self, value=None, method=None, limit=None): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9a4846c98bd22..e901c11cf3054 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1531,13 +1531,14 @@ def check_for_ordered(self, op): def _values_for_argsort(self): return self._codes.copy() - def argsort(self, *args, **kwargs): - # TODO(PY2): use correct signature - # We have to do *args, **kwargs to avoid a a py2-only signature - # issue since np.argsort differs from argsort. + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ Return the indices that would sort the Categorical. + .. versionchanged:: 0.25.0 + + Changed to sort missing values at the end. + Parameters ---------- ascending : bool, default True @@ -1574,9 +1575,14 @@ def argsort(self, *args, **kwargs): ... ordered=True) >>> cat.argsort() array([3, 0, 1, 2]) + + Missing values are placed at the end + + >>> cat = pd.Categorical([2, None, 1]) + >>> cat.argsort() + array([2, 0, 1]) """ - # Keep the implementation here just for the docstring. 
- return super().argsort(*args, **kwargs) + return super().argsort(ascending, kind, *args, **kwargs) def sort_values(self, inplace=False, ascending=True, na_position='last'): """ diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 750a4c903176f..b79390581612b 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -240,20 +240,6 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): items = extract_array(items) mask = np.asarray(isna(items)) - # specially handle Categorical - if is_categorical_dtype(items): - if na_position not in {'first', 'last'}: - raise ValueError('invalid na_position: {!r}'.format(na_position)) - - cnt_null = mask.sum() - sorted_idx = items.argsort(ascending=ascending, kind=kind) - if ascending and na_position == 'last': - # NaN is coded as -1 and is listed in front after sorting - sorted_idx = np.roll(sorted_idx, -cnt_null) - elif not ascending and na_position == 'first': - # NaN is coded as -1 and is listed in the end after sorting - sorted_idx = np.roll(sorted_idx, cnt_null) - return sorted_idx if is_extension_array_dtype(items): items = items._values_for_argsort() diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index d9e61e6a227e6..9b154a8afeabc 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -47,6 +47,14 @@ def test_argsort(self, data_for_sorting): expected = pd.Series(np.array([2, 0, 1], dtype=np.int64)) self.assert_series_equal(result, expected) + def test_argsort_missing_array(self, data_missing_for_sorting): + result = data_missing_for_sorting.argsort() + expected = np.array([2, 0, 1], dtype=np.dtype("int")) + # we don't care whether it's int32 or int64 + result = result.astype("int64", casting="safe") + expected = expected.astype("int64", casting="safe") + tm.assert_numpy_array_equal(result, expected) + def test_argsort_missing(self, data_missing_for_sorting): result = 
pd.Series(data_missing_for_sorting).argsort() expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 96aeb608ba3b8..11de77f6779e6 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -539,7 +539,7 @@ def test_sort_index_categorical_index(self): assert_frame_equal(result, expected) result = df.sort_index(ascending=False) - expected = df.iloc[[3, 2, 5, 1, 0, 4]] + expected = df.iloc[[2, 3, 0, 1, 5, 4]] assert_frame_equal(result, expected) def test_sort_index(self): @@ -629,7 +629,7 @@ def test_sort_index_na_position_with_categories(self): reversed_categories = sorted(categories, reverse=True) reversed_category_indices = sorted(category_indices, reverse=True) - reversed_na_indices = sorted(na_indices, reverse=True) + reversed_na_indices = sorted(na_indices) df = pd.DataFrame({ column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],