diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 083242cd69b74..f1158b9ad87eb 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -460,6 +460,29 @@ To restore previous behavior, simply set ``expand`` to ``False``: extracted type(extracted) +.. _whatsnew_0230.api_breaking.cdt_ordered: + +Default value for the ``ordered`` parameter of ``CategoricalDtype`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The default value of the ``ordered`` parameter for :class:`~pandas.api.types.CategoricalDtype` has changed from ``False`` to ``None`` to allow updating of ``categories`` without impacting ``ordered``. Behavior should remain consistent for downstream objects, such as :class:`Categorical` (:issue:`18790`) + +In previous versions, the default value for the ``ordered`` parameter was ``False``. This could potentially lead to the ``ordered`` parameter unintentionally being changed from ``True`` to ``False`` when users attempt to update ``categories`` if ``ordered`` is not explicitly specified, as it would silently default to ``False``. The new behavior for ``ordered=None`` is to retain the existing value of ``ordered``. + +New Behavior: + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba')) + cat + cdt = CategoricalDtype(categories=list('cbad')) + cat.astype(cdt) + +Notice in the example above that the converted ``Categorical`` has retained ``ordered=True``. Had the default value for ``ordered`` remained as ``False``, the converted ``Categorical`` would have become unordered, despite ``ordered=False`` never being explicitly specified. To change the value of ``ordered``, explicitly pass it to the new dtype, e.g. ``CategoricalDtype(categories=list('cbad'), ordered=False)``. + +Note that the unintenional conversion of ``ordered`` discussed above did not arise in previous versions due to separate bugs that prevented ``astype`` from doing any type of category to category conversion (:issue:`10696`, :issue:`18593`). These bugs have been fixed in this release, and motivated changing the default value of ``ordered``. + .. _whatsnew_0230.api: Other API Changes diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 62c6a6b16cbe9..93250bdbb5054 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -243,7 +243,7 @@ class Categorical(ExtensionArray, PandasObject): # For comparisons, so that numpy uses our implementation if the compare # ops, which raise __array_priority__ = 1000 - _dtype = CategoricalDtype() + _dtype = CategoricalDtype(ordered=False) _deprecations = frozenset(['labels']) _typ = 'categorical' @@ -294,7 +294,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, if fastpath: self._codes = coerce_indexer_dtype(values, categories) - self._dtype = dtype + self._dtype = self._dtype.update_dtype(dtype) return # null_mask indicates missing values we want to exclude from inference. @@ -358,7 +358,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, full_codes[~null_mask] = codes codes = full_codes - self._dtype = dtype + self._dtype = self._dtype.update_dtype(dtype) self._codes = coerce_indexer_dtype(codes, dtype.categories) @property @@ -438,7 +438,7 @@ def astype(self, dtype, copy=True): """ if is_categorical_dtype(dtype): # GH 10696/18593 - dtype = self.dtype._update_dtype(dtype) + dtype = self.dtype.update_dtype(dtype) self = self.copy() if copy else self if dtype == self.dtype: return self @@ -560,7 +560,7 @@ def from_codes(cls, codes, categories, ordered=False): raise ValueError( "codes need to be convertible to an arrays of integers") - categories = CategoricalDtype._validate_categories(categories) + categories = CategoricalDtype.validate_categories(categories) if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and " @@ -1165,7 +1165,7 @@ def __setstate__(self, state): # Provide compatibility with pre-0.15.0 Categoricals. if '_categories' not in state and '_levels' in state: - state['_categories'] = self.dtype._validate_categories(state.pop( + state['_categories'] = self.dtype.validate_categories(state.pop( '_levels')) if '_codes' not in state and 'labels' in state: state['_codes'] = coerce_indexer_dtype( diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index d8d3a96992757..99e4033f104db 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -159,11 +159,11 @@ class CategoricalDtype(PandasExtensionDtype): _metadata = ['categories', 'ordered'] _cache = {} - def __init__(self, categories=None, ordered=False): + def __init__(self, categories=None, ordered=None): self._finalize(categories, ordered, fastpath=False) @classmethod - def _from_fastpath(cls, categories=None, ordered=False): + def _from_fastpath(cls, categories=None, ordered=None): self = cls.__new__(cls) self._finalize(categories, ordered, fastpath=True) return self @@ -180,14 +180,12 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None): def _finalize(self, categories, ordered, fastpath=False): - if ordered is None: - ordered = False - else: - self._validate_ordered(ordered) + if ordered is not None: + self.validate_ordered(ordered) if categories is not None: - categories = self._validate_categories(categories, - fastpath=fastpath) + categories = self.validate_categories(categories, + fastpath=fastpath) self._categories = categories self._ordered = ordered @@ -208,6 +206,17 @@ def __hash__(self): return int(self._hash_categories(self.categories, self.ordered)) def __eq__(self, other): + """ + Rules for CDT equality: + 1) Any CDT is equal to the string 'category' + 2) Any CDT is equal to a CDT with categories=None regardless of ordered + 3) A CDT with ordered=True is only equal to another CDT with + ordered=True and identical categories in the same order + 4) A CDT with ordered={False, None} is only equal to another CDT with + ordered={False, None} and identical categories, but same order is + not required. There is no distinction between False/None. + 5) Any other comparison returns False + """ if isinstance(other, compat.string_types): return other == self.name @@ -220,12 +229,16 @@ def __eq__(self, other): # CDT(., .) = CDT(None, False) and *all* # CDT(., .) = CDT(None, True). return True - elif self.ordered: - return other.ordered and self.categories.equals(other.categories) - elif other.ordered: - return False + elif self.ordered or other.ordered: + # At least one has ordered=True; equal if both have ordered=True + # and the same values for categories in the same order. + return ((self.ordered == other.ordered) and + self.categories.equals(other.categories)) else: - # both unordered; this could probably be optimized / cached + # Neither has ordered=True; equal if both have the same categories, + # but same order is not necessary. There is no distinction between + # ordered=False and ordered=None: CDT(., False) and CDT(., None) + # will be equal if they have the same categories. return hash(self) == hash(other) def __repr__(self): @@ -288,7 +301,7 @@ def construct_from_string(cls, string): raise TypeError("cannot construct a CategoricalDtype") @staticmethod - def _validate_ordered(ordered): + def validate_ordered(ordered): """ Validates that we have a valid ordered parameter. If it is not a boolean, a TypeError will be raised. @@ -308,7 +321,7 @@ def _validate_ordered(ordered): raise TypeError("'ordered' must either be 'True' or 'False'") @staticmethod - def _validate_categories(categories, fastpath=False): + def validate_categories(categories, fastpath=False): """ Validates that we have good categories @@ -340,7 +353,7 @@ def _validate_categories(categories, fastpath=False): return categories - def _update_dtype(self, dtype): + def update_dtype(self, dtype): """ Returns a CategoricalDtype with categories and ordered taken from dtype if specified, otherwise falling back to self if unspecified @@ -361,11 +374,16 @@ def _update_dtype(self, dtype): 'got {dtype!r}').format(dtype=dtype) raise ValueError(msg) - # dtype is CDT: keep current categories if None (ordered can't be None) + # dtype is CDT: keep current categories/ordered if None new_categories = dtype.categories if new_categories is None: new_categories = self.categories - return CategoricalDtype(new_categories, dtype.ordered) + + new_ordered = dtype.ordered + if new_ordered is None: + new_ordered = self.ordered + + return CategoricalDtype(new_categories, new_ordered) @property def categories(self): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index b36bc1df23247..60f5552576ea1 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -344,7 +344,7 @@ def astype(self, dtype, copy=True): return IntervalIndex(np.array(self)) elif is_categorical_dtype(dtype): # GH 18630 - dtype = self.dtype._update_dtype(dtype) + dtype = self.dtype.update_dtype(dtype) if dtype == self.dtype: return self.copy() if copy else self diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index d800a7b92b559..cc833af03ae66 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -24,6 +24,11 @@ import pandas.util.testing as tm +@pytest.fixture(params=[True, False, None]) +def ordered(request): + return request.param + + class Base(object): def setup_method(self, method): @@ -124,41 +129,6 @@ def test_tuple_categories(self): result = CategoricalDtype(categories) assert all(result.categories == categories) - @pytest.mark.parametrize('dtype', [ - CategoricalDtype(list('abc'), False), - CategoricalDtype(list('abc'), True)]) - @pytest.mark.parametrize('new_dtype', [ - 'category', - CategoricalDtype(None, False), - CategoricalDtype(None, True), - CategoricalDtype(list('abc'), False), - CategoricalDtype(list('abc'), True), - CategoricalDtype(list('cba'), False), - CategoricalDtype(list('cba'), True), - CategoricalDtype(list('wxyz'), False), - CategoricalDtype(list('wxyz'), True)]) - def test_update_dtype(self, dtype, new_dtype): - if isinstance(new_dtype, string_types) and new_dtype == 'category': - expected_categories = dtype.categories - expected_ordered = dtype.ordered - else: - expected_categories = new_dtype.categories - if expected_categories is None: - expected_categories = dtype.categories - expected_ordered = new_dtype.ordered - - result = dtype._update_dtype(new_dtype) - tm.assert_index_equal(result.categories, expected_categories) - assert result.ordered is expected_ordered - - @pytest.mark.parametrize('bad_dtype', [ - 'foo', object, np.int64, PeriodDtype('Q')]) - def test_update_dtype_errors(self, bad_dtype): - dtype = CategoricalDtype(list('abc'), False) - msg = 'a CategoricalDtype must be passed to perform an update, ' - with tm.assert_raises_regex(ValueError, msg): - dtype._update_dtype(bad_dtype) - class TestDatetimeTZDtype(Base): @@ -609,17 +579,12 @@ def test_caching(self): class TestCategoricalDtypeParametrized(object): - @pytest.mark.parametrize('categories, ordered', [ - (['a', 'b', 'c', 'd'], False), - (['a', 'b', 'c', 'd'], True), - (np.arange(1000), False), - (np.arange(1000), True), - (['a', 'b', 10, 2, 1.3, True], False), - ([True, False], True), - ([True, False], False), - (pd.date_range('2017', periods=4), True), - (pd.date_range('2017', periods=4), False), - ]) + @pytest.mark.parametrize('categories', [ + list('abcd'), + np.arange(1000), + ['a', 'b', 10, 2, 1.3, True], + [True, False], + pd.date_range('2017', periods=4)]) def test_basic(self, categories, ordered): c1 = CategoricalDtype(categories, ordered=ordered) tm.assert_index_equal(c1.categories, pd.Index(categories)) @@ -627,21 +592,24 @@ def test_basic(self, categories, ordered): def test_order_matters(self): categories = ['a', 'b'] - c1 = CategoricalDtype(categories, ordered=False) - c2 = CategoricalDtype(categories, ordered=True) + c1 = CategoricalDtype(categories, ordered=True) + c2 = CategoricalDtype(categories, ordered=False) + c3 = CategoricalDtype(categories, ordered=None) assert c1 is not c2 + assert c1 is not c3 - def test_unordered_same(self): - c1 = CategoricalDtype(['a', 'b']) - c2 = CategoricalDtype(['b', 'a']) + @pytest.mark.parametrize('ordered', [False, None]) + def test_unordered_same(self, ordered): + c1 = CategoricalDtype(['a', 'b'], ordered=ordered) + c2 = CategoricalDtype(['b', 'a'], ordered=ordered) assert hash(c1) == hash(c2) def test_categories(self): result = CategoricalDtype(['a', 'b', 'c']) tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c'])) - assert result.ordered is False + assert result.ordered is None - def test_equal_but_different(self): + def test_equal_but_different(self, ordered): c1 = CategoricalDtype([1, 2, 3]) c2 = CategoricalDtype([1., 2., 3.]) assert c1 is not c2 @@ -652,9 +620,11 @@ def test_equal_but_different(self): ([1, 2, 3], [3, 2, 1]), ]) def test_order_hashes_different(self, v1, v2): - c1 = CategoricalDtype(v1) + c1 = CategoricalDtype(v1, ordered=False) c2 = CategoricalDtype(v2, ordered=True) + c3 = CategoricalDtype(v1, ordered=None) assert c1 is not c2 + assert c1 is not c3 def test_nan_invalid(self): with pytest.raises(ValueError): @@ -669,26 +639,46 @@ def test_same_categories_different_order(self): c2 = CategoricalDtype(['b', 'a'], ordered=True) assert c1 is not c2 - @pytest.mark.parametrize('ordered, other, expected', [ - (True, CategoricalDtype(['a', 'b'], True), True), - (False, CategoricalDtype(['a', 'b'], False), True), - (True, CategoricalDtype(['a', 'b'], False), False), - (False, CategoricalDtype(['a', 'b'], True), False), - (True, CategoricalDtype([1, 2], False), False), - (False, CategoricalDtype([1, 2], True), False), - (False, CategoricalDtype(None, True), True), - (True, CategoricalDtype(None, True), True), - (False, CategoricalDtype(None, False), True), - (True, CategoricalDtype(None, False), True), - (True, 'category', True), - (False, 'category', True), - (True, 'not a category', False), - (False, 'not a category', False), - ]) - def test_categorical_equality(self, ordered, other, expected): - c1 = CategoricalDtype(['a', 'b'], ordered) + @pytest.mark.parametrize('ordered1', [True, False, None]) + @pytest.mark.parametrize('ordered2', [True, False, None]) + def test_categorical_equality(self, ordered1, ordered2): + # same categories, same order + # any combination of None/False are equal + # True/True is the only combination with True that are equal + c1 = CategoricalDtype(list('abc'), ordered1) + c2 = CategoricalDtype(list('abc'), ordered2) + result = c1 == c2 + expected = bool(ordered1) is bool(ordered2) + assert result is expected + + # same categories, different order + # any combination of None/False are equal (order doesn't matter) + # any combination with True are not equal (different order of cats) + c1 = CategoricalDtype(list('abc'), ordered1) + c2 = CategoricalDtype(list('cab'), ordered2) + result = c1 == c2 + expected = (bool(ordered1) is False) and (bool(ordered2) is False) + assert result is expected + + # different categories + c2 = CategoricalDtype([1, 2, 3], ordered2) + assert c1 != c2 + + # none categories + c1 = CategoricalDtype(list('abc'), ordered1) + c2 = CategoricalDtype(None, ordered2) + c3 = CategoricalDtype(None, ordered1) + assert c1 == c2 + assert c2 == c1 + assert c2 == c3 + + @pytest.mark.parametrize('categories', [list('abc'), None]) + @pytest.mark.parametrize('other', ['category', 'not a category']) + def test_categorical_equality_strings(self, categories, ordered, other): + c1 = CategoricalDtype(categories, ordered) result = c1 == other - assert result == expected + expected = other == 'category' + assert result is expected def test_invalid_raises(self): with tm.assert_raises_regex(TypeError, 'ordered'): @@ -729,12 +719,12 @@ def test_from_categorical_dtype_both(self): c1, categories=[1, 2], ordered=False) assert result == CategoricalDtype([1, 2], ordered=False) - def test_str_vs_repr(self): - c1 = CategoricalDtype(['a', 'b']) + def test_str_vs_repr(self, ordered): + c1 = CategoricalDtype(['a', 'b'], ordered=ordered) assert str(c1) == 'category' # Py2 will have unicode prefixes - pat = r"CategoricalDtype\(categories=\[.*\], ordered=False\)" - assert re.match(pat, repr(c1)) + pat = r"CategoricalDtype\(categories=\[.*\], ordered={ordered}\)" + assert re.match(pat.format(ordered=ordered), repr(c1)) def test_categorical_categories(self): # GH17884 @@ -742,3 +732,38 @@ def test_categorical_categories(self): tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) c1 = CategoricalDtype(CategoricalIndex(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) + + @pytest.mark.parametrize('new_categories', [ + list('abc'), list('cba'), list('wxyz'), None]) + @pytest.mark.parametrize('new_ordered', [True, False, None]) + def test_update_dtype(self, ordered, new_categories, new_ordered): + dtype = CategoricalDtype(list('abc'), ordered) + new_dtype = CategoricalDtype(new_categories, new_ordered) + + expected_categories = new_dtype.categories + if expected_categories is None: + expected_categories = dtype.categories + + expected_ordered = new_dtype.ordered + if expected_ordered is None: + expected_ordered = dtype.ordered + + result = dtype.update_dtype(new_dtype) + tm.assert_index_equal(result.categories, expected_categories) + assert result.ordered is expected_ordered + + def test_update_dtype_string(self, ordered): + dtype = CategoricalDtype(list('abc'), ordered) + expected_categories = dtype.categories + expected_ordered = dtype.ordered + result = dtype.update_dtype('category') + tm.assert_index_equal(result.categories, expected_categories) + assert result.ordered is expected_ordered + + @pytest.mark.parametrize('bad_dtype', [ + 'foo', object, np.int64, PeriodDtype('Q')]) + def test_update_dtype_errors(self, bad_dtype): + dtype = CategoricalDtype(list('abc'), False) + msg = 'a CategoricalDtype must be passed to perform an update, ' + with tm.assert_raises_regex(ValueError, msg): + dtype.update_dtype(bad_dtype)