Skip to content

Commit

Permalink
ENH: Consistent error messages for ranking non-numeric data. (pandas-…
Browse files Browse the repository at this point in the history
  • Loading branch information
mapehe committed Apr 12, 2018
1 parent 2d8e03b commit ff18426
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 56 deletions.
11 changes: 11 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,17 @@ def rank(values, axis=0, method='average', na_option='keep',
Whether or not to the display the returned rankings in integer form
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
"""
if is_object_dtype(values):
def raise_non_numeric_error():
raise ValueError("pandas.core.algorithms.rank "
"not supported for unordered "
"non-numeric data")
if is_categorical_dtype(values):
if not values.ordered:
raise_non_numeric_error()
else:
raise_non_numeric_error()

if values.ndim == 1:
f, values = _get_data_algo(values, _rank1d_functions)
ranks = f(values, ties_method=method, ascending=ascending,
Expand Down
48 changes: 21 additions & 27 deletions pandas/tests/frame/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,23 +71,22 @@ def test_rank2(self):
result = df.rank(0, pct=True)
tm.assert_frame_equal(result, expected)

df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)
# See #19560
error_msg = ("pandas.core.algorithms.rank "
"not supported for unordered "
"non-numeric data")

expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
with tm.assert_raises_regex(ValueError, error_msg):
df.rank(1, numeric_only=False)
with tm.assert_raises_regex(ValueError, error_msg):
df.rank(0, numeric_only=False)

df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']])
expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)

expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
with tm.assert_raises_regex(ValueError, error_msg):
df.rank(1, numeric_only=False)
with tm.assert_raises_regex(ValueError, error_msg):
df.rank(0, numeric_only=False)

# f7u12, this does not work without extensive workaround
data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
Expand All @@ -110,9 +109,9 @@ def test_rank2(self):
self.mixed_frame['datetime'] = datetime.now()
self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)

result = self.mixed_frame.rank(1)
expected = self.mixed_frame.rank(1, numeric_only=True)
tm.assert_frame_equal(result, expected)
# mixed_frame["foo"] is of string-type
with tm.assert_raises_regex(ValueError, error_msg):
self.mixed_frame.rank(1)

df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10,
1e60, 1e80, 1e-30]})
Expand Down Expand Up @@ -218,7 +217,7 @@ def test_rank_methods_frame(self):
tm.assert_frame_equal(result, expected)

def test_rank_descending(self):
dtypes = ['O', 'f8', 'i8']
dtypes = ['f8', 'i8']

for dtype, method in product(dtypes, self.results):
if 'i' in dtype:
Expand All @@ -230,15 +229,11 @@ def test_rank_descending(self):
expected = (df.max() - df).rank()
assert_frame_equal(res, expected)

if method == 'first' and dtype == 'O':
continue

expected = (df.max() - df).rank(method=method)

if dtype != 'O':
res2 = df.rank(method=method, ascending=False,
numeric_only=True)
assert_frame_equal(res2, expected)
res2 = df.rank(method=method, ascending=False,
numeric_only=True)
assert_frame_equal(res2, expected)

res3 = df.rank(method=method, ascending=False,
numeric_only=False)
Expand All @@ -258,11 +253,10 @@ def _check2d(df, expected, method='average', axis=0):
assert_frame_equal(result, exp_df)

dtypes = [None, object]
disabled = set([(object, 'first')])
results = self.results

for method, axis, dtype in product(results, [0, 1], dtypes):
if (dtype, method) in disabled:
if dtype == object:
continue
frame = df if dtype is None else df.astype(dtype)
_check2d(frame, results[method], method=method, axis=axis)
Expand Down
60 changes: 31 additions & 29 deletions pandas/tests/series/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,22 +134,27 @@ def test_rank_categorical(self):
assert_series_equal(ordered.rank(), exp)
assert_series_equal(ordered.rank(ascending=False), exp_desc)

# Unordered categoricals should be ranked as objects
# See #19560
error_msg = ("pandas.core.algorithms.rank "
"not supported for unordered "
"non-numeric data")

# Ranking unordered categorials depreciated per #19560
unordered = Series(['first', 'second', 'third', 'fourth',
'fifth', 'sixth']).astype(
CategoricalDtype(categories=['first', 'second', 'third',
'fourth', 'fifth', 'sixth'],
ordered=False))
exp_unordered = Series([2., 4., 6., 3., 1., 5.])
res = unordered.rank()
assert_series_equal(res, exp_unordered)

with tm.assert_raises_regex(ValueError, error_msg):
unordered.rank()

unordered1 = Series(
[1, 2, 3, 4, 5, 6],
).astype(CategoricalDtype([1, 2, 3, 4, 5, 6], False))
exp_unordered1 = Series([1., 2., 3., 4., 5., 6.])
res1 = unordered1.rank()
assert_series_equal(res1, exp_unordered1)

# Won't raise ValueError because entries not objects.
unordered1.rank()

# Test na_option for rank data
na_ser = Series(
Expand Down Expand Up @@ -213,16 +218,13 @@ def test_rank_signature(self):
'int64',
marks=pytest.mark.xfail(
reason="iNaT is equivalent to minimum value of dtype"
"int64 pending issue #16674")),
([NegInfinity(), '1', 'A', 'BA', 'Ba', 'C', Infinity()],
'object')
"int64 pending issue #16674"))
])
def test_rank_inf(self, contents, dtype):
dtype_na_map = {
'float64': np.nan,
'float32': np.nan,
'int64': iNaT,
'object': None
'int64': iNaT
}
# Insert nans at random positions if underlying dtype has missing
# value. Then adjust the expected order by adding nans accordingly
Expand All @@ -249,13 +251,10 @@ def _check(s, expected, method='average'):
result = s.rank(method=method)
tm.assert_series_equal(result, Series(expected))

dtypes = [None, object]
disabled = set([(object, 'first')])
dtypes = [None]
results = self.results

for method, dtype in product(results, dtypes):
if (dtype, method) in disabled:
continue
series = s if dtype is None else s.astype(dtype)
_check(series, results[method], method=method)

Expand Down Expand Up @@ -294,7 +293,7 @@ def _check(s, method, na_option, ascending):
for dtype, na_value, pos_inf, neg_inf in dtypes:
in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk
iseries = Series(in_arr, dtype=dtype)
if (dtype, method) in disabled:
if dtype == 'object':
continue
_check(iseries, method, na_option, ascending)

Expand Down Expand Up @@ -330,7 +329,7 @@ def test_rank_methods_series(self):
tm.assert_series_equal(result, expected)

def test_rank_dense_method(self):
dtypes = ['O', 'f8', 'i8']
dtypes = ['f8', 'i8']
in_out = [([1], [1]),
([2], [1]),
([0], [1]),
Expand All @@ -348,7 +347,7 @@ def test_rank_dense_method(self):
assert_series_equal(result, expected)

def test_rank_descending(self):
dtypes = ['O', 'f8', 'i8']
dtypes = ['f8', 'i8']

for dtype, method in product(dtypes, self.results):
if 'i' in dtype:
Expand All @@ -360,9 +359,6 @@ def test_rank_descending(self):
expected = (s.max() - s).rank()
assert_series_equal(res, expected)

if method == 'first' and dtype == 'O':
continue

expected = (s.max() - s).rank(method=method)
res2 = s.rank(method=method, ascending=False)
assert_series_equal(res2, expected)
Expand All @@ -379,9 +375,15 @@ def test_rank_int(self):
def test_rank_object_bug(self):
# GH 13445

# smoke tests
Series([np.nan] * 32).astype(object).rank(ascending=True)
Series([np.nan] * 32).astype(object).rank(ascending=False)
# See #19560
error_msg = ("pandas.core.algorithms.rank "
"not supported for unordered "
"non-numeric data")

with tm.assert_raises_regex(ValueError, error_msg):
Series([np.nan] * 32).astype(object).rank(ascending=True)
with tm.assert_raises_regex(ValueError, error_msg):
Series([np.nan] * 32).astype(object).rank(ascending=False)

def test_rank_modify_inplace(self):
# GH 18521
Expand All @@ -396,7 +398,7 @@ def test_rank_modify_inplace(self):

# GH15630, pct should be on 100% basis when method='dense'

@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('dtype', ['f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
Expand All @@ -414,7 +416,7 @@ def test_rank_dense_pct(dtype, ser, exp):
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('dtype', ['f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
Expand All @@ -432,7 +434,7 @@ def test_rank_min_pct(dtype, ser, exp):
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('dtype', ['f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
Expand All @@ -450,7 +452,7 @@ def test_rank_max_pct(dtype, ser, exp):
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('dtype', ['f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
Expand Down

0 comments on commit ff18426

Please sign in to comment.