Skip to content

Commit

Permalink
CLN: move safe_sort from core.algorithms to core.sorting (pandas-dev#17034)
Browse files Browse the repository at this point in the history

COMPAT: safe_sort will only coerce list-likes to object, not a numpy string type

xref: pandas-dev#17003 (comment)
  • Loading branch information
jreback authored and alanbato committed Nov 10, 2017
1 parent a8f0369 commit fed8561
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 192 deletions.
100 changes: 1 addition & 99 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
from pandas.core.dtypes.missing import isnull

from pandas.core import common as com
from pandas.compat import string_types
from pandas._libs import algos, lib, hashtable as htable
from pandas._libs.tslib import iNaT

Expand Down Expand Up @@ -431,104 +430,6 @@ def isin(comps, values):
return f(comps, values)


def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
    """
    Sort ``values`` and reorder corresponding ``labels``.

    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list_like
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        # NOTE: the trailing space matters; the two literals are concatenated.
        raise TypeError("Only list-like objects are allowed to be passed to "
                        "safe_sort as values")
    values = np.asarray(values)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, string_types) for x in values],
                           dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return _ensure_object(np.concatenate([nums, strs]))

    # ``sorter`` stays None whenever the mixed-type path is taken, because
    # that path yields sorted values but no argsort permutation.
    sorter = None
    if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer':
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        # NOTE: the trailing space matters; the two literals are concatenated.
        raise TypeError("Only list-like objects or None are allowed to be "
                        "passed to safe_sort as labels")
    labels = _ensure_platform_int(np.asarray(labels))

    # Imported inside the function (presumably to avoid a circular import
    # at module load time -- TODO confirm).
    from pandas import Index
    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types: rebuild the permutation by mapping each original
        # value to its location and looking up the sorted values.
        (hash_klass, _), values = _get_data_algo(values, _hashtables)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = _ensure_platform_int(t.lookup(ordered))

    # Invert the permutation: reverse_indexer[old_position] = new_position.
    reverse_indexer = np.empty(len(sorter), dtype=np.int_)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    mask = ((labels < -len(values)) | (labels >= len(values)) |
            (labels == na_sentinel))

    # (Out of bound indices will be masked with `na_sentinel` next, so we may
    # deal with them here without performance loss using `mode='wrap'`.)
    new_labels = reverse_indexer.take(labels, mode='wrap')
    np.putmask(new_labels, mask, na_sentinel)

    return ordered, _ensure_platform_int(new_labels)


def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
"""
Encode input values as an enumerated type or categorical variable
Expand Down Expand Up @@ -568,6 +469,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
uniques = uniques.to_array()

if sort and len(uniques) > 0:
from pandas.core.sorting import safe_sort
uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
assume_unique=True)

Expand Down
5 changes: 3 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import pandas.core.dtypes.concat as _concat
import pandas.core.missing as missing
import pandas.core.algorithms as algos
import pandas.core.sorting as sorting
from pandas.io.formats.printing import pprint_thing
from pandas.core.ops import _comp_method_OBJECT_ARRAY
from pandas.core.strings import StringAccessorMixin
Expand Down Expand Up @@ -2306,7 +2307,7 @@ def difference(self, other):
assume_unique=True)
the_diff = this.values.take(label_diff)
try:
the_diff = algos.safe_sort(the_diff)
the_diff = sorting.safe_sort(the_diff)
except TypeError:
pass

Expand Down Expand Up @@ -2366,7 +2367,7 @@ def symmetric_difference(self, other, result_name=None):

the_diff = _concat._concat_compat([left_diff, right_diff])
try:
the_diff = algos.safe_sort(the_diff)
the_diff = sorting.safe_sort(the_diff)
except TypeError:
pass

Expand Down
3 changes: 2 additions & 1 deletion pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

from pandas.core.sorting import is_int64_overflow_possible
import pandas.core.algorithms as algos
import pandas.core.sorting as sorting
import pandas.core.common as com
from pandas._libs import hashtable as libhashtable, join as libjoin, lib
from pandas.errors import MergeError
Expand Down Expand Up @@ -1491,7 +1492,7 @@ def _sort_labels(uniques, left, right):
l = len(left)
labels = np.concatenate([left, right])

_, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
_, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1)
new_labels = _ensure_int64(new_labels)
new_left, new_right = new_labels[:l], new_labels[l:]

Expand Down
108 changes: 107 additions & 1 deletion pandas/core/sorting.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
""" miscellaneous sorting / groupby utilities """

import numpy as np
from pandas.compat import long
from pandas.compat import long, string_types, PY3
from pandas.core.categorical import Categorical
from pandas.core.dtypes.common import (
_ensure_platform_int,
_ensure_int64,
is_list_like,
is_categorical_dtype)
from pandas.core.dtypes.cast import infer_dtype_from_array
from pandas.core.dtypes.missing import isnull
import pandas.core.algorithms as algorithms
from pandas._libs import lib, algos, hashtable
Expand Down Expand Up @@ -376,3 +378,107 @@ def _reorder_by_uniques(uniques, labels):
uniques = algorithms.take_nd(uniques, sorter, allow_fill=False)

return uniques, labels


def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
    """
    Sort ``values`` and reorder corresponding ``labels``.

    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list_like
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        # NOTE: the trailing space matters; the two literals are concatenated.
        raise TypeError("Only list-like objects are allowed to be passed to "
                        "safe_sort as values")

    if not isinstance(values, np.ndarray):

        # don't convert to string types: infer the dtype first and pass it
        # explicitly instead of letting np.asarray choose one on its own
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, string_types) for x in values],
                           dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    # ``sorter`` stays None whenever the mixed-type path is taken, because
    # that path yields sorted values but no argsort permutation.
    sorter = None
    if PY3 and lib.infer_dtype(values) == 'mixed-integer':
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        # NOTE: the trailing space matters; the two literals are concatenated.
        raise TypeError("Only list-like objects or None are allowed to be "
                        "passed to safe_sort as labels")
    labels = _ensure_platform_int(np.asarray(labels))

    # Imported inside the function (presumably to avoid a circular import
    # at module load time -- TODO confirm).
    from pandas import Index
    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types: rebuild the permutation by mapping each original
        # value to its location and looking up the sorted values.
        (hash_klass, _), values = algorithms._get_data_algo(
            values, algorithms._hashtables)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = _ensure_platform_int(t.lookup(ordered))

    # Invert the permutation: reverse_indexer[old_position] = new_position.
    reverse_indexer = np.empty(len(sorter), dtype=np.int_)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    mask = ((labels < -len(values)) | (labels >= len(values)) |
            (labels == na_sentinel))

    # (Out of bound indices will be masked with `na_sentinel` next, so we may
    # deal with them here without performance loss using `mode='wrap'`.)
    new_labels = reverse_indexer.take(labels, mode='wrap')
    np.putmask(new_labels, mask, na_sentinel)

    return ordered, _ensure_platform_int(new_labels)
88 changes: 0 additions & 88 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import numpy as np
import pytest
import warnings

from numpy.random import RandomState
from numpy import nan
Expand Down Expand Up @@ -60,93 +59,6 @@ def test_strings(self):
tm.assert_series_equal(result, expected)


class TestSafeSort(object):
    """Checks for ``algos.safe_sort``: plain sorting, label reordering,
    mixed int/str input, unsortable input and argument validation."""

    def test_basic_sort(self):
        # integers
        result = algos.safe_sort([3, 1, 2, 0, 4])
        tm.assert_numpy_array_equal(result, np.array([0, 1, 2, 3, 4]))

        # single-character strings, with repeats
        result = algos.safe_sort(list("baaacb"))
        tm.assert_numpy_array_equal(result, np.array(list("aaabbc")))

        # empty input
        result = algos.safe_sort([])
        tm.assert_numpy_array_equal(result, np.array([]))

    def test_labels(self):
        values = [3, 1, 2, 0, 4]
        sorted_values = np.array([0, 1, 2, 3, 4])

        # (labels, extra keyword arguments, expected reordered labels)
        cases = [
            # default sentinel
            ([0, 1, 1, 2, 3, 0, -1, 4], {},
             [3, 1, 1, 2, 0, 3, -1, 4]),
            # na_sentinel
            ([0, 1, 1, 2, 3, 0, 99, 4], {'na_sentinel': 99},
             [3, 1, 1, 2, 0, 3, 99, 4]),
            # out of bound indices
            ([0, 101, 102, 2, 3, 0, 99, 4], {},
             [3, -1, -1, 2, 0, 3, -1, 4]),
            # no labels at all
            ([], {}, []),
        ]
        for labels, kwargs, wanted in cases:
            result, result_labels = algos.safe_sort(values, labels, **kwargs)
            expected_labels = np.array(wanted, dtype=np.intp)
            tm.assert_numpy_array_equal(result, sorted_values)
            tm.assert_numpy_array_equal(result_labels, expected_labels)

    def test_mixed_integer(self):
        # values only
        result = algos.safe_sort(
            np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object))
        tm.assert_numpy_array_equal(
            result, np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object))

        # values together with labels
        result, result_labels = algos.safe_sort(
            np.array(['b', 1, 0, 'a'], dtype=object),
            [0, 1, 2, 3, 0, -1, 1])
        tm.assert_numpy_array_equal(
            result, np.array([0, 1, 'a', 'b'], dtype=object))
        tm.assert_numpy_array_equal(
            result_labels,
            np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp))

    def test_unsortable(self):
        # GH 13714
        arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object)
        guard_warnings = compat.PY2 and not pd._np_version_under1p10
        if not guard_warnings:
            pytest.raises(TypeError, algos.safe_sort, arr)
            return

        # RuntimeWarning: tp_compare didn't return -1 or -2 for exception
        with warnings.catch_warnings():
            pytest.raises(TypeError, algos.safe_sort, arr)

    def test_exceptions(self):
        # (expected exception, message pattern, keyword arguments)
        bad_calls = [
            (TypeError, "Only list-like objects are allowed",
             dict(values=1)),
            (TypeError, "Only list-like objects or None",
             dict(values=[0, 1, 2], labels=1)),
            (ValueError, "values should be unique",
             dict(values=[0, 1, 2, 1], labels=[0, 1])),
        ]
        for exc_type, pattern, kwargs in bad_calls:
            with tm.assert_raises_regex(exc_type, pattern):
                algos.safe_sort(**kwargs)


class TestFactorize(object):

def test_basic(self):
Expand Down
Loading

0 comments on commit fed8561

Please sign in to comment.