From fed8561154aae888b53eb89e87ba525070e20529 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 20 Jul 2017 10:23:39 -0400 Subject: [PATCH] CLN: move safe_sort from core.algorithms to core.sorting (#17034) COMPAT: safe_sort will only coerce list-likes to object, not a numpy string type xref: https://github.com/pandas-dev/pandas/pull/17003#discussion_r128332208 --- pandas/core/algorithms.py | 100 +------------------------------- pandas/core/indexes/base.py | 5 +- pandas/core/reshape/merge.py | 3 +- pandas/core/sorting.py | 108 ++++++++++++++++++++++++++++++++++- pandas/tests/test_algos.py | 88 ---------------------------- pandas/tests/test_sorting.py | 98 ++++++++++++++++++++++++++++++- 6 files changed, 210 insertions(+), 192 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 79beb95d93ea15..3ccd7216fa81a4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -30,7 +30,6 @@ from pandas.core.dtypes.missing import isnull from pandas.core import common as com -from pandas.compat import string_types from pandas._libs import algos, lib, hashtable as htable from pandas._libs.tslib import iNaT @@ -431,104 +430,6 @@ def isin(comps, values): return f(comps, values) -def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): - """ - Sort ``values`` and reorder corresponding ``labels``. - ``values`` should be unique if ``labels`` is not None. - Safe for use with mixed types (int, str), orders ints before strs. - - .. versionadded:: 0.19.0 - - Parameters - ---------- - values : list-like - Sequence; must be unique if ``labels`` is not None. - labels : list_like - Indices to ``values``. All out of bound indices are treated as - "not found" and will be masked with ``na_sentinel``. - na_sentinel : int, default -1 - Value in ``labels`` to mark "not found". - Ignored when ``labels`` is None. - assume_unique : bool, default False - When True, ``values`` are assumed to be unique, which can speed up - the calculation. Ignored when ``labels`` is None. - - Returns - ------- - ordered : ndarray - Sorted ``values`` - new_labels : ndarray - Reordered ``labels``; returned when ``labels`` is not None. - - Raises - ------ - TypeError - * If ``values`` is not list-like or if ``labels`` is neither None - nor list-like - * If ``values`` cannot be sorted - ValueError - * If ``labels`` is not None and ``values`` contain duplicates. - """ - if not is_list_like(values): - raise TypeError("Only list-like objects are allowed to be passed to" - "safe_sort as values") - values = np.asarray(values) - - def sort_mixed(values): - # order ints before strings, safe in py3 - str_pos = np.array([isinstance(x, string_types) for x in values], - dtype=bool) - nums = np.sort(values[~str_pos]) - strs = np.sort(values[str_pos]) - return _ensure_object(np.concatenate([nums, strs])) - - sorter = None - if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer': - # unorderable in py3 if mixed str/int - ordered = sort_mixed(values) - else: - try: - sorter = values.argsort() - ordered = values.take(sorter) - except TypeError: - # try this anyway - ordered = sort_mixed(values) - - # labels: - - if labels is None: - return ordered - - if not is_list_like(labels): - raise TypeError("Only list-like objects or None are allowed to be" - "passed to safe_sort as labels") - labels = _ensure_platform_int(np.asarray(labels)) - - from pandas import Index - if not assume_unique and not Index(values).is_unique: - raise ValueError("values should be unique if labels is not None") - - if sorter is None: - # mixed types - (hash_klass, _), values = _get_data_algo(values, _hashtables) - t = hash_klass(len(values)) - t.map_locations(values) - sorter = _ensure_platform_int(t.lookup(ordered)) - - reverse_indexer = np.empty(len(sorter), dtype=np.int_) - reverse_indexer.put(sorter, np.arange(len(sorter))) - - mask = (labels < -len(values)) | (labels >= len(values)) | \ - (labels == na_sentinel) - - # (Out of bound indices will be masked with `na_sentinel` next, so we may - # deal with them here without performance loss using `mode='wrap'`.) - new_labels = reverse_indexer.take(labels, mode='wrap') - np.putmask(new_labels, mask, na_sentinel) - - return ordered, _ensure_platform_int(new_labels) - - def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ Encode input values as an enumerated type or categorical variable @@ -568,6 +469,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = uniques.to_array() if sort and len(uniques) > 0: + from pandas.core.sorting import safe_sort uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, assume_unique=True) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5d50f961927c78..c95a9598604eef 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -48,6 +48,7 @@ import pandas.core.dtypes.concat as _concat import pandas.core.missing as missing import pandas.core.algorithms as algos +import pandas.core.sorting as sorting from pandas.io.formats.printing import pprint_thing from pandas.core.ops import _comp_method_OBJECT_ARRAY from pandas.core.strings import StringAccessorMixin @@ -2306,7 +2307,7 @@ def difference(self, other): assume_unique=True) the_diff = this.values.take(label_diff) try: - the_diff = algos.safe_sort(the_diff) + the_diff = sorting.safe_sort(the_diff) except TypeError: pass @@ -2366,7 +2367,7 @@ def symmetric_difference(self, other, result_name=None): the_diff = _concat._concat_compat([left_diff, right_diff]) try: - the_diff = algos.safe_sort(the_diff) + the_diff = sorting.safe_sort(the_diff) except TypeError: pass diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index beebe06e7477e3..8e4367a6784da0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -38,6 +38,7 @@ from pandas.core.sorting import is_int64_overflow_possible import pandas.core.algorithms as algos +import pandas.core.sorting as sorting import pandas.core.common as com from pandas._libs import hashtable as libhashtable, join as libjoin, lib from pandas.errors import MergeError @@ -1491,7 +1492,7 @@ def _sort_labels(uniques, left, right): l = len(left) labels = np.concatenate([left, right]) - _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) + _, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1) new_labels = _ensure_int64(new_labels) new_left, new_right = new_labels[:l], new_labels[l:] diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 10b80cbc3483dd..44a27bb5cbae13 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,12 +1,14 @@ """ miscellaneous sorting / groupby utilities """ import numpy as np -from pandas.compat import long +from pandas.compat import long, string_types, PY3 from pandas.core.categorical import Categorical from pandas.core.dtypes.common import ( _ensure_platform_int, _ensure_int64, + is_list_like, is_categorical_dtype) +from pandas.core.dtypes.cast import infer_dtype_from_array from pandas.core.dtypes.missing import isnull import pandas.core.algorithms as algorithms from pandas._libs import lib, algos, hashtable @@ -376,3 +378,107 @@ def _reorder_by_uniques(uniques, labels): uniques = algorithms.take_nd(uniques, sorter, allow_fill=False) return uniques, labels + + +def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): + """ + Sort ``values`` and reorder corresponding ``labels``. + ``values`` should be unique if ``labels`` is not None. + Safe for use with mixed types (int, str), orders ints before strs. + + .. versionadded:: 0.19.0 + + Parameters + ---------- + values : list-like + Sequence; must be unique if ``labels`` is not None. + labels : list_like + Indices to ``values``. All out of bound indices are treated as + "not found" and will be masked with ``na_sentinel``. + na_sentinel : int, default -1 + Value in ``labels`` to mark "not found". + Ignored when ``labels`` is None. + assume_unique : bool, default False + When True, ``values`` are assumed to be unique, which can speed up + the calculation. Ignored when ``labels`` is None. + + Returns + ------- + ordered : ndarray + Sorted ``values`` + new_labels : ndarray + Reordered ``labels``; returned when ``labels`` is not None. + + Raises + ------ + TypeError + * If ``values`` is not list-like or if ``labels`` is neither None + nor list-like + * If ``values`` cannot be sorted + ValueError + * If ``labels`` is not None and ``values`` contain duplicates. + """ + if not is_list_like(values): + raise TypeError("Only list-like objects are allowed to be passed to" + "safe_sort as values") + + if not isinstance(values, np.ndarray): + + # don't convert to string types + dtype, _ = infer_dtype_from_array(values) + values = np.asarray(values, dtype=dtype) + + def sort_mixed(values): + # order ints before strings, safe in py3 + str_pos = np.array([isinstance(x, string_types) for x in values], + dtype=bool) + nums = np.sort(values[~str_pos]) + strs = np.sort(values[str_pos]) + return np.concatenate([nums, np.asarray(strs, dtype=object)]) + + sorter = None + if PY3 and lib.infer_dtype(values) == 'mixed-integer': + # unorderable in py3 if mixed str/int + ordered = sort_mixed(values) + else: + try: + sorter = values.argsort() + ordered = values.take(sorter) + except TypeError: + # try this anyway + ordered = sort_mixed(values) + + # labels: + + if labels is None: + return ordered + + if not is_list_like(labels): + raise TypeError("Only list-like objects or None are allowed to be" + "passed to safe_sort as labels") + labels = _ensure_platform_int(np.asarray(labels)) + + from pandas import Index + if not assume_unique and not Index(values).is_unique: + raise ValueError("values should be unique if labels is not None") + + if sorter is None: + # mixed types + (hash_klass, _), values = algorithms._get_data_algo( + values, algorithms._hashtables) + t = hash_klass(len(values)) + t.map_locations(values) + sorter = _ensure_platform_int(t.lookup(ordered)) + + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + mask = (labels < -len(values)) | (labels >= len(values)) | \ + (labels == na_sentinel) + + # (Out of bound indices will be masked with `na_sentinel` next, so we may + # deal with them here without performance loss using `mode='wrap'`.) + new_labels = reverse_indexer.take(labels, mode='wrap') + np.putmask(new_labels, mask, na_sentinel) + + return ordered, _ensure_platform_int(new_labels) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 4588bf17fdbeb7..9e7b97f19e0c39 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2,7 +2,6 @@ import numpy as np import pytest -import warnings from numpy.random import RandomState from numpy import nan @@ -60,93 +59,6 @@ def test_strings(self): tm.assert_series_equal(result, expected) -class TestSafeSort(object): - - def test_basic_sort(self): - values = [3, 1, 2, 0, 4] - result = algos.safe_sort(values) - expected = np.array([0, 1, 2, 3, 4]) - tm.assert_numpy_array_equal(result, expected) - - values = list("baaacb") - result = algos.safe_sort(values) - expected = np.array(list("aaabbc")) - tm.assert_numpy_array_equal(result, expected) - - values = [] - result = algos.safe_sort(values) - expected = np.array([]) - tm.assert_numpy_array_equal(result, expected) - - def test_labels(self): - values = [3, 1, 2, 0, 4] - expected = np.array([0, 1, 2, 3, 4]) - - labels = [0, 1, 1, 2, 3, 0, -1, 4] - result, result_labels = algos.safe_sort(values, labels) - expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) - - # na_sentinel - labels = [0, 1, 1, 2, 3, 0, 99, 4] - result, result_labels = algos.safe_sort(values, labels, - na_sentinel=99) - expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) - - # out of bound indices - labels = [0, 101, 102, 2, 3, 0, 99, 4] - result, result_labels = algos.safe_sort(values, labels) - expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) - - labels = [] - result, result_labels = algos.safe_sort(values, labels) - expected_labels = np.array([], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) - - def test_mixed_integer(self): - values = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object) - result = algos.safe_sort(values) - expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object) - tm.assert_numpy_array_equal(result, expected) - - values = np.array(['b', 1, 0, 'a'], dtype=object) - labels = [0, 1, 2, 3, 0, -1, 1] - result, result_labels = algos.safe_sort(values, labels) - expected = np.array([0, 1, 'a', 'b'], dtype=object) - expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) - - def test_unsortable(self): - # GH 13714 - arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) - if compat.PY2 and not pd._np_version_under1p10: - # RuntimeWarning: tp_compare didn't return -1 or -2 for exception - with warnings.catch_warnings(): - pytest.raises(TypeError, algos.safe_sort, arr) - else: - pytest.raises(TypeError, algos.safe_sort, arr) - - def test_exceptions(self): - with tm.assert_raises_regex(TypeError, - "Only list-like objects are allowed"): - algos.safe_sort(values=1) - - with tm.assert_raises_regex(TypeError, - "Only list-like objects or None"): - algos.safe_sort(values=[0, 1, 2], labels=1) - - with tm.assert_raises_regex(ValueError, - "values should be unique"): - algos.safe_sort(values=[0, 1, 2, 1], labels=[0, 1]) - - class TestFactorize(object): def test_basic(self): diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index e09270bcadf270..f6973cccb82b02 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -1,6 +1,8 @@ import pytest from itertools import product from collections import defaultdict +import warnings +from datetime import datetime import numpy as np from numpy import nan @@ -13,7 +15,8 @@ decons_group_index, get_group_index, nargsort, - lexsort_indexer) + lexsort_indexer, + safe_sort) class TestSorting(object): @@ -340,3 +343,96 @@ def testit(label_list, shape): shape = (10000, 10000) label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] testit(label_list, shape) + + +class TestSafeSort(object): + + def test_basic_sort(self): + values = [3, 1, 2, 0, 4] + result = safe_sort(values) + expected = np.array([0, 1, 2, 3, 4]) + tm.assert_numpy_array_equal(result, expected) + + values = list("baaacb") + result = safe_sort(values) + expected = np.array(list("aaabbc"), dtype='object') + tm.assert_numpy_array_equal(result, expected) + + values = [] + result = safe_sort(values) + expected = np.array([]) + tm.assert_numpy_array_equal(result, expected) + + def test_labels(self): + values = [3, 1, 2, 0, 4] + expected = np.array([0, 1, 2, 3, 4]) + + labels = [0, 1, 1, 2, 3, 0, -1, 4] + result, result_labels = safe_sort(values, labels) + expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + # na_sentinel + labels = [0, 1, 1, 2, 3, 0, 99, 4] + result, result_labels = safe_sort(values, labels, + na_sentinel=99) + expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + # out of bound indices + labels = [0, 101, 102, 2, 3, 0, 99, 4] + result, result_labels = safe_sort(values, labels) + expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + labels = [] + result, result_labels = safe_sort(values, labels) + expected_labels = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + def test_mixed_integer(self): + values = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object) + result = safe_sort(values) + expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + values = np.array(['b', 1, 0, 'a'], dtype=object) + labels = [0, 1, 2, 3, 0, -1, 1] + result, result_labels = safe_sort(values, labels) + expected = np.array([0, 1, 'a', 'b'], dtype=object) + expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + def test_mixed_interger_from_list(self): + values = ['b', 1, 0, 'a', 0, 'b'] + result = safe_sort(values) + expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_unsortable(self): + # GH 13714 + arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) + if compat.PY2 and not pd._np_version_under1p10: + # RuntimeWarning: tp_compare didn't return -1 or -2 for exception + with warnings.catch_warnings(): + pytest.raises(TypeError, safe_sort, arr) + else: + pytest.raises(TypeError, safe_sort, arr) + + def test_exceptions(self): + with tm.assert_raises_regex(TypeError, + "Only list-like objects are allowed"): + safe_sort(values=1) + + with tm.assert_raises_regex(TypeError, + "Only list-like objects or None"): + safe_sort(values=[0, 1, 2], labels=1) + + with tm.assert_raises_regex(ValueError, + "values should be unique"): + safe_sort(values=[0, 1, 2, 1], labels=[0, 1])