diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 065a5782aced16..9a2363c31d066a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -859,6 +859,17 @@ def rank(values, axis=0, method='average', na_option='keep', Whether or not to the display the returned rankings in integer form (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). """ + if is_object_dtype(values): + def raise_non_numeric_error(): + raise ValueError("pandas.core.algorithms.rank " + "not supported for unordered " + "non-numeric data") + if is_categorical_dtype(values): + if not values.ordered: + raise_non_numeric_error() + else: + raise_non_numeric_error() + if values.ndim == 1: f, values = _get_data_algo(values, _rank1d_functions) ranks = f(values, ties_method=method, ascending=ascending, diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py index b8ba408b547158..785b470d6ef48a 100644 --- a/pandas/tests/frame/test_rank.py +++ b/pandas/tests/frame/test_rank.py @@ -71,23 +71,22 @@ def test_rank2(self): result = df.rank(0, pct=True) tm.assert_frame_equal(result, expected) - df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']]) - expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]) - result = df.rank(1, numeric_only=False) - tm.assert_frame_equal(result, expected) + # See #19560 + error_msg = ("pandas.core.algorithms.rank " + "not supported for unordered " + "non-numeric data") - expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]]) - result = df.rank(0, numeric_only=False) - tm.assert_frame_equal(result, expected) + df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']]) + with tm.assert_raises_regex(ValueError, error_msg): + df.rank(1, numeric_only=False) + with tm.assert_raises_regex(ValueError, error_msg): + df.rank(0, numeric_only=False) df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']]) - expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]]) - result = df.rank(1, numeric_only=False) - tm.assert_frame_equal(result, expected) - - expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]]) - result = df.rank(0, numeric_only=False) - tm.assert_frame_equal(result, expected) + with tm.assert_raises_regex(ValueError, error_msg): + df.rank(1, numeric_only=False) + with tm.assert_raises_regex(ValueError, error_msg): + df.rank(0, numeric_only=False) # f7u12, this does not work without extensive workaround data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)], @@ -110,9 +109,9 @@ def test_rank2(self): self.mixed_frame['datetime'] = datetime.now() self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1) - result = self.mixed_frame.rank(1) - expected = self.mixed_frame.rank(1, numeric_only=True) - tm.assert_frame_equal(result, expected) + # mixed_frame["foo"] is of string-type + with tm.assert_raises_regex(ValueError, error_msg): + self.mixed_frame.rank(1) df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]}) @@ -218,7 +217,7 @@ def test_rank_methods_frame(self): tm.assert_frame_equal(result, expected) def test_rank_descending(self): - dtypes = ['O', 'f8', 'i8'] + dtypes = ['f8', 'i8'] for dtype, method in product(dtypes, self.results): if 'i' in dtype: @@ -230,15 +229,11 @@ def test_rank_descending(self): expected = (df.max() - df).rank() assert_frame_equal(res, expected) - if method == 'first' and dtype == 'O': - continue - expected = (df.max() - df).rank(method=method) - if dtype != 'O': - res2 = df.rank(method=method, ascending=False, - numeric_only=True) - assert_frame_equal(res2, expected) + res2 = df.rank(method=method, ascending=False, + numeric_only=True) + assert_frame_equal(res2, expected) res3 = df.rank(method=method, ascending=False, numeric_only=False) @@ -258,11 +253,10 @@ def _check2d(df, expected, method='average', axis=0): assert_frame_equal(result, exp_df) dtypes = [None, object] - disabled = set([(object, 'first')]) results = self.results for method, axis, dtype in product(results, [0, 1], dtypes): - if (dtype, method) in disabled: + if dtype == object: continue frame = df if dtype is None else df.astype(dtype) _check2d(frame, results[method], method=method, axis=axis) diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index 004e42e14cb93f..25853e6a19fb35 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -134,22 +134,27 @@ def test_rank_categorical(self): assert_series_equal(ordered.rank(), exp) assert_series_equal(ordered.rank(ascending=False), exp_desc) - # Unordered categoricals should be ranked as objects + # See #19560 + error_msg = ("pandas.core.algorithms.rank " + "not supported for unordered " + "non-numeric data") + + # Ranking unordered categorials depreciated per #19560 unordered = Series(['first', 'second', 'third', 'fourth', 'fifth', 'sixth']).astype( CategoricalDtype(categories=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=False)) - exp_unordered = Series([2., 4., 6., 3., 1., 5.]) - res = unordered.rank() - assert_series_equal(res, exp_unordered) + + with tm.assert_raises_regex(ValueError, error_msg): + unordered.rank() unordered1 = Series( [1, 2, 3, 4, 5, 6], ).astype(CategoricalDtype([1, 2, 3, 4, 5, 6], False)) - exp_unordered1 = Series([1., 2., 3., 4., 5., 6.]) - res1 = unordered1.rank() - assert_series_equal(res1, exp_unordered1) + + # Won't raise ValueError because entries not objects. + unordered1.rank() # Test na_option for rank data na_ser = Series( @@ -213,16 +218,13 @@ def test_rank_signature(self): 'int64', marks=pytest.mark.xfail( reason="iNaT is equivalent to minimum value of dtype" - "int64 pending issue #16674")), - ([NegInfinity(), '1', 'A', 'BA', 'Ba', 'C', Infinity()], - 'object') + "int64 pending issue #16674")) ]) def test_rank_inf(self, contents, dtype): dtype_na_map = { 'float64': np.nan, 'float32': np.nan, - 'int64': iNaT, - 'object': None + 'int64': iNaT } # Insert nans at random positions if underlying dtype has missing # value. Then adjust the expected order by adding nans accordingly @@ -249,13 +251,10 @@ def _check(s, expected, method='average'): result = s.rank(method=method) tm.assert_series_equal(result, Series(expected)) - dtypes = [None, object] - disabled = set([(object, 'first')]) + dtypes = [None] results = self.results for method, dtype in product(results, dtypes): - if (dtype, method) in disabled: - continue series = s if dtype is None else s.astype(dtype) _check(series, results[method], method=method) @@ -294,7 +293,7 @@ def _check(s, method, na_option, ascending): for dtype, na_value, pos_inf, neg_inf in dtypes: in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk iseries = Series(in_arr, dtype=dtype) - if (dtype, method) in disabled: + if dtype == 'object': continue _check(iseries, method, na_option, ascending) @@ -330,7 +329,7 @@ def test_rank_methods_series(self): tm.assert_series_equal(result, expected) def test_rank_dense_method(self): - dtypes = ['O', 'f8', 'i8'] + dtypes = ['f8', 'i8'] in_out = [([1], [1]), ([2], [1]), ([0], [1]), @@ -348,7 +347,7 @@ def test_rank_dense_method(self): assert_series_equal(result, expected) def test_rank_descending(self): - dtypes = ['O', 'f8', 'i8'] + dtypes = ['f8', 'i8'] for dtype, method in product(dtypes, self.results): if 'i' in dtype: @@ -360,9 +359,6 @@ def test_rank_descending(self): expected = (s.max() - s).rank() assert_series_equal(res, expected) - if method == 'first' and dtype == 'O': - continue - expected = (s.max() - s).rank(method=method) res2 = s.rank(method=method, ascending=False) assert_series_equal(res2, expected) @@ -379,9 +375,15 @@ def test_rank_int(self): def test_rank_object_bug(self): # GH 13445 - # smoke tests - Series([np.nan] * 32).astype(object).rank(ascending=True) - Series([np.nan] * 32).astype(object).rank(ascending=False) + # See #19560 + error_msg = ("pandas.core.algorithms.rank " + "not supported for unordered " + "non-numeric data") + + with tm.assert_raises_regex(ValueError, error_msg): + Series([np.nan] * 32).astype(object).rank(ascending=True) + with tm.assert_raises_regex(ValueError, error_msg): + Series([np.nan] * 32).astype(object).rank(ascending=False) def test_rank_modify_inplace(self): # GH 18521 @@ -396,7 +398,7 @@ def test_rank_modify_inplace(self): # GH15630, pct should be on 100% basis when method='dense' -@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) +@pytest.mark.parametrize('dtype', ['f8', 'i8']) @pytest.mark.parametrize('ser, exp', [ ([1], [1.]), ([1, 2], [1. / 2, 2. / 2]), @@ -414,7 +416,7 @@ def test_rank_dense_pct(dtype, ser, exp): assert_series_equal(result, expected) -@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) +@pytest.mark.parametrize('dtype', ['f8', 'i8']) @pytest.mark.parametrize('ser, exp', [ ([1], [1.]), ([1, 2], [1. / 2, 2. / 2]), @@ -432,7 +434,7 @@ def test_rank_min_pct(dtype, ser, exp): assert_series_equal(result, expected) -@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) +@pytest.mark.parametrize('dtype', ['f8', 'i8']) @pytest.mark.parametrize('ser, exp', [ ([1], [1.]), ([1, 2], [1. / 2, 2. / 2]), @@ -450,7 +452,7 @@ def test_rank_max_pct(dtype, ser, exp): assert_series_equal(result, expected) -@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) +@pytest.mark.parametrize('dtype', ['f8', 'i8']) @pytest.mark.parametrize('ser, exp', [ ([1], [1.]), ([1, 2], [1. / 2, 2. / 2]),