diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 41c946cc9a559..62cf977d8c8ac 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -935,13 +935,23 @@ In previous versions, the default value for the ``ordered`` parameter was ``Fals New behavior: -.. ipython:: python +.. code-block:: ipython - from pandas.api.types import CategoricalDtype - cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba')) - cat - cdt = CategoricalDtype(categories=list('cbad')) - cat.astype(cdt) + In [2]: from pandas.api.types import CategoricalDtype + + In [3]: cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba')) + + In [4]: cat + Out[4]: + [a, b, c, a, b, a] + Categories (3, object): [c < b < a] + + In [5]: cdt = CategoricalDtype(categories=list('cbad')) + + In [6]: cat.astype(cdt) + Out[6]: + [a, b, c, a, b, a] + Categories (4, object): [c < b < a < d] Notice in the example above that the converted ``Categorical`` has retained ``ordered=True``. Had the default value for ``ordered`` remained as ``False``, the converted ``Categorical`` would have become unordered, despite ``ordered=False`` never being explicitly specified. To change the value of ``ordered``, explicitly pass it to the new dtype, e.g. ``CategoricalDtype(categories=list('cbad'), ordered=False)``. diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index e9d23cfd8efc1..bc7916ab39c40 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -761,6 +761,7 @@ Other deprecations - :attr:`Series.imag` and :attr:`Series.real` are deprecated. (:issue:`18262`) - :meth:`Series.put` is deprecated. (:issue:`18262`) - :meth:`Index.item` and :meth:`Series.item` is deprecated. (:issue:`18262`) +- The default value ``ordered=None`` in :class:`~pandas.api.types.CategoricalDtype` has been deprecated in favor of ``ordered=False``. When converting between categorical types ``ordered=True`` must be explicitly passed in order to be preserved. (:issue:`26336`) - :meth:`Index.contains` is deprecated. Use ``key in index`` (``__contains__``) instead (:issue:`17753`). .. _whatsnew_0250.prior_deprecations: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b77a4f985067d..9a4846c98bd22 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -332,7 +332,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # sanitize input if is_categorical_dtype(values): if dtype.categories is None: - dtype = CategoricalDtype(values.categories, dtype.ordered) + dtype = CategoricalDtype(values.categories, dtype._ordered) elif not isinstance(values, (ABCIndexClass, ABCSeries)): # sanitize_array coerces np.nan to a string under certain versions # of numpy @@ -355,7 +355,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, codes, categories = factorize(values, sort=True) except TypeError: codes, categories = factorize(values, sort=False) - if dtype.ordered: + if dtype._ordered: # raise, as we don't have a sortable data structure and so # the user should give us one by specifying categories raise TypeError("'values' is not ordered, please " @@ -368,7 +368,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, "supported at this time") # we're inferring from values - dtype = CategoricalDtype(categories, dtype.ordered) + dtype = CategoricalDtype(categories, dtype._ordered) elif is_categorical_dtype(values): old_codes = (values._values.codes if isinstance(values, ABCSeries) @@ -433,7 +433,7 @@ def ordered(self): """ Whether the categories have an ordered relationship. """ - return self.dtype.ordered + return self.dtype._ordered @property def dtype(self) -> CategoricalDtype: @@ -847,7 +847,7 @@ def set_categories(self, new_categories, ordered=None, rename=False, """ inplace = validate_bool_kwarg(inplace, 'inplace') if ordered is None: - ordered = self.dtype.ordered + ordered = self.dtype._ordered new_dtype = CategoricalDtype(new_categories, ordered=ordered) cat = self if inplace else self.copy() diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 81e061a0fc7b4..d8d910a16e32a 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -17,6 +17,13 @@ str_type = str +# GH26403: sentinel value used for the default value of ordered in the +# CategoricalDtype constructor to detect when ordered=None is explicitly passed +ordered_sentinel = object() # type: object + +# TODO(GH26403): Replace with Optional[bool] or bool +OrderedType = Union[None, bool, object] + def register_extension_dtype(cls: Type[ExtensionDtype], ) -> Type[ExtensionDtype]: @@ -214,7 +221,9 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): _metadata = ('categories', 'ordered') _cache = {} # type: Dict[str_type, PandasExtensionDtype] - def __init__(self, categories=None, ordered: Optional[bool] = None): + def __init__(self, + categories=None, + ordered: OrderedType = ordered_sentinel): self._finalize(categories, ordered, fastpath=False) @classmethod @@ -230,7 +239,7 @@ def _from_fastpath(cls, def _from_categorical_dtype(cls, dtype: 'CategoricalDtype', categories=None, - ordered: Optional[bool] = None, + ordered: OrderedType = None, ) -> 'CategoricalDtype': if categories is ordered is None: return dtype @@ -330,11 +339,11 @@ def _from_values_or_dtype(cls, def _finalize(self, categories, - ordered: Optional[bool], + ordered: OrderedType, fastpath: bool = False, ) -> None: - if ordered is not None: + if ordered is not None and ordered is not ordered_sentinel: self.validate_ordered(ordered) if categories is not None: @@ -342,7 +351,8 @@ def _finalize(self, fastpath=fastpath) self._categories = categories - self._ordered = ordered + self._ordered = ordered if ordered is not ordered_sentinel else None + self._ordered_from_sentinel = ordered is ordered_sentinel def __setstate__(self, state: Dict[str_type, Any]) -> None: # for pickle compat. __get_state__ is defined in the @@ -355,12 +365,12 @@ def __hash__(self) -> int: # _hash_categories returns a uint64, so use the negative # space for when we have unknown categories to avoid a conflict if self.categories is None: - if self.ordered: + if self._ordered: return -1 else: return -2 # We *do* want to include the real self.ordered here - return int(self._hash_categories(self.categories, self.ordered)) + return int(self._hash_categories(self.categories, self._ordered)) def __eq__(self, other: Any) -> bool: """ @@ -379,7 +389,7 @@ def __eq__(self, other: Any) -> bool: return other == self.name elif other is self: return True - elif not (hasattr(other, 'ordered') and hasattr(other, 'categories')): + elif not (hasattr(other, '_ordered') and hasattr(other, 'categories')): return False elif self.categories is None or other.categories is None: # We're forced into a suboptimal corner thanks to math and @@ -388,10 +398,10 @@ def __eq__(self, other: Any) -> bool: # CDT(., .) = CDT(None, False) and *all* # CDT(., .) = CDT(None, True). return True - elif self.ordered or other.ordered: + elif self._ordered or other._ordered: # At least one has ordered=True; equal if both have ordered=True # and the same values for categories in the same order. - return ((self.ordered == other.ordered) and + return ((self._ordered == other._ordered) and self.categories.equals(other.categories)) else: # Neither has ordered=True; equal if both have the same categories, @@ -406,10 +416,10 @@ def __repr__(self): data = "None, " else: data = self.categories._format_data(name=self.__class__.__name__) - return tpl.format(data, self.ordered) + return tpl.format(data, self._ordered) @staticmethod - def _hash_categories(categories, ordered: Optional[bool] = True) -> int: + def _hash_categories(categories, ordered: OrderedType = True) -> int: from pandas.core.util.hashing import ( hash_array, _combine_hash_arrays, hash_tuples ) @@ -459,7 +469,7 @@ def construct_array_type(cls): return Categorical @staticmethod - def validate_ordered(ordered: bool) -> None: + def validate_ordered(ordered: OrderedType) -> None: """ Validates that we have a valid ordered parameter. If it is not a boolean, a TypeError will be raised. @@ -534,17 +544,25 @@ def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype': msg = ('a CategoricalDtype must be passed to perform an update, ' 'got {dtype!r}').format(dtype=dtype) raise ValueError(msg) - elif dtype.categories is not None and dtype.ordered is self.ordered: - return dtype # dtype is CDT: keep current categories/ordered if None new_categories = dtype.categories if new_categories is None: new_categories = self.categories - new_ordered = dtype.ordered + new_ordered = dtype._ordered + new_ordered_from_sentinel = dtype._ordered_from_sentinel if new_ordered is None: - new_ordered = self.ordered + # maintain existing ordered if new dtype has ordered=None + new_ordered = self._ordered + if self._ordered and new_ordered_from_sentinel: + # only warn if we'd actually change the existing behavior + msg = ("Constructing a CategoricalDtype without specifying " + "`ordered` will default to `ordered=False` in a future " + "version, which will cause the resulting categorical's " + "`ordered` attribute to change to False; `ordered=True`" + " must be explicitly passed in order to be retained") + warnings.warn(msg, FutureWarning, stacklevel=3) return CategoricalDtype(new_categories, new_ordered) @@ -556,10 +574,18 @@ def categories(self): return self._categories @property - def ordered(self) -> Optional[bool]: + def ordered(self) -> OrderedType: """ Whether the categories have an ordered relationship. """ + # TODO: remove if block when ordered=None as default is deprecated + if self._ordered_from_sentinel and self._ordered is None: + # warn when accessing ordered if ordered=None and None was not + # explicitly passed to the constructor + msg = ("Constructing a CategoricalDtype without specifying " + "`ordered` will default to `ordered=False` in a future " + "version; `ordered=None` must be explicitly passed.") + warnings.warn(msg, FutureWarning, stacklevel=2) return self._ordered @property diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index bdfb854679a2c..1044f25a6bbcd 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -725,7 +725,7 @@ def _try_cast(arr, dtype, copy, raise_cast_failure): # We *do* allow casting to categorical, since we know # that Categorical is the only array type for 'category'. subarr = Categorical(arr, dtype.categories, - ordered=dtype.ordered) + ordered=dtype._ordered) elif is_extension_array_dtype(dtype): # create an extension array from its dtype array_type = dtype.construct_array_type()._from_sequence diff --git a/pandas/core/series.py b/pandas/core/series.py index 9179099562832..f5f9f1ab4f9ab 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -19,7 +19,7 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( - _is_unorderable_exception, ensure_platform_int, is_bool, + _is_unorderable_exception, ensure_platform_int, is_bool, is_categorical, is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like, is_extension_array_dtype, is_extension_type, is_hashable, is_integer, is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype) @@ -170,6 +170,12 @@ def __init__(self, data=None, index=None, dtype=None, name=None, if data is None: data = {} if dtype is not None: + # GH 26336: explicitly handle 'category' to avoid warning + # TODO: Remove after CategoricalDtype defaults to ordered=False + if (isinstance(dtype, str) and dtype == 'category' and + is_categorical(data)): + dtype = data.dtype + dtype = self._validate_dtype(dtype) if isinstance(data, MultiIndex): diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 30e51e62aa764..24995d1e2e492 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -618,14 +618,9 @@ def decode(obj): return Interval(obj['left'], obj['right'], obj['closed']) elif typ == 'series': dtype = dtype_for(obj['dtype']) - pd_dtype = pandas_dtype(dtype) - index = obj['index'] - result = Series(unconvert(obj['data'], dtype, obj['compress']), - index=index, - dtype=pd_dtype, - name=obj['name']) - return result + data = unconvert(obj['data'], dtype, obj['compress']) + return Series(data, index=index, dtype=dtype, name=obj['name']) elif typ == 'block_manager': axes = obj['axes'] diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index b8c223ab3b04e..14ad3c4d5e860 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -160,6 +160,14 @@ def test_astype_category(self, dtype_ordered, cat_ordered): expected = cat tm.assert_categorical_equal(result, expected) + def test_astype_category_ordered_none_deprecated(self): + # GH 26336 + cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) + cdt2 = CategoricalDtype(categories=list('cedafb')) + cat = Categorical(list('abcdaba'), dtype=cdt1) + with tm.assert_produces_warning(FutureWarning): + cat.astype(cdt2) + def test_iter_python_types(self): # GH-19909 cat = Categorical([1, 2]) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index cf368f9980d72..ad2195af73d96 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -10,7 +10,8 @@ is_datetime64tz_dtype, is_datetimetz, is_dtype_equal, is_interval_dtype, is_period, is_period_dtype, is_string_dtype) from pandas.core.dtypes.dtypes import ( - CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, registry) + CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, + ordered_sentinel, registry) import pandas as pd from pandas import ( @@ -54,7 +55,8 @@ def test_pickle(self): class TestCategoricalDtype(Base): def create(self): - return CategoricalDtype() + # TODO(GH 26403): Remove when default ordered becomes False + return CategoricalDtype(ordered=None) def test_pickle(self): # make sure our cache is NOT pickled @@ -675,7 +677,8 @@ def test_unordered_same(self, ordered): def test_categories(self): result = CategoricalDtype(['a', 'b', 'c']) tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c'])) - assert result.ordered is None + with tm.assert_produces_warning(FutureWarning): + assert result.ordered is None def test_equal_but_different(self, ordered_fixture): c1 = CategoricalDtype([1, 2, 3]) @@ -804,7 +807,8 @@ def test_categorical_categories(self): @pytest.mark.parametrize('new_categories', [ list('abc'), list('cba'), list('wxyz'), None]) - @pytest.mark.parametrize('new_ordered', [True, False, None]) + @pytest.mark.parametrize('new_ordered', [ + True, False, None, ordered_sentinel]) def test_update_dtype(self, ordered_fixture, new_categories, new_ordered): dtype = CategoricalDtype(list('abc'), ordered_fixture) new_dtype = CategoricalDtype(new_categories, new_ordered) @@ -813,11 +817,18 @@ def test_update_dtype(self, ordered_fixture, new_categories, new_ordered): if expected_categories is None: expected_categories = dtype.categories - expected_ordered = new_dtype.ordered - if expected_ordered is None: + expected_ordered = new_ordered + if new_ordered is ordered_sentinel or new_ordered is None: expected_ordered = dtype.ordered - result = dtype.update_dtype(new_dtype) + # GH 26336 + if new_ordered is ordered_sentinel and ordered_fixture is True: + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = dtype.update_dtype(new_dtype) + else: + result = dtype.update_dtype(new_dtype) + tm.assert_index_equal(result.categories, expected_categories) assert result.ordered is expected_ordered @@ -837,6 +848,14 @@ def test_update_dtype_errors(self, bad_dtype): with pytest.raises(ValueError, match=msg): dtype.update_dtype(bad_dtype) + @pytest.mark.parametrize('ordered', [ordered_sentinel, None, True, False]) + def test_ordered_none_default_deprecated(self, ordered): + # GH 26403: CDT.ordered only warns if ordered is not explicitly passed + dtype = CategoricalDtype(list('abc'), ordered=ordered) + warning = FutureWarning if ordered is ordered_sentinel else None + with tm.assert_produces_warning(warning): + dtype.ordered + @pytest.mark.parametrize('dtype', [ CategoricalDtype, diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 26a4463d421a4..3b5092c901061 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -5,7 +5,7 @@ from pandas._libs import index as libindex -from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel import pandas as pd from pandas import Categorical, IntervalIndex @@ -503,6 +503,16 @@ def test_astype_category(self, name, dtype_ordered, index_ordered): expected = index tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('none, warning', [ + (None, None), (ordered_sentinel, FutureWarning)]) + def test_astype_category_ordered_none_deprecated(self, none, warning): + # GH 26336: only warn if None is not explicitly passed + cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) + cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none) + idx = CategoricalIndex(list('abcdaba'), dtype=cdt1) + with tm.assert_produces_warning(warning): + idx.astype(cdt2) + def test_reindex_base(self): # Determined by cat ordering. idx = CategoricalIndex(list("cab"), categories=list("cab")) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 49417942a3598..663d5ae505303 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -12,12 +12,12 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64tz_dtype) +from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel import pandas as pd from pandas import ( Categorical, DataFrame, Index, IntervalIndex, MultiIndex, NaT, Series, Timestamp, date_range, isna, period_range, timedelta_range) -from pandas.api.types import CategoricalDtype from pandas.core.arrays import period_array import pandas.util.testing as tm from pandas.util.testing import assert_series_equal @@ -372,6 +372,35 @@ def test_constructor_categorical_dtype(self): dtype=CategoricalDtype(['a', 'b'], ordered=True)) tm.assert_series_equal(result, expected, check_categorical=True) + def test_constructor_categorical_string(self): + # GH 26336: the string 'category' maintains existing CategoricalDtype + cdt = CategoricalDtype(categories=list('dabc'), ordered=True) + expected = Series(list('abcabc'), dtype=cdt) + + # Series(Categorical, dtype='category') keeps existing dtype + cat = Categorical(list('abcabc'), dtype=cdt) + result = Series(cat, dtype='category') + tm.assert_series_equal(result, expected) + + # Series(Series[Categorical], dtype='category') keeps existing dtype + result = Series(result, dtype='category') + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('none, warning', [ + (None, None), (ordered_sentinel, FutureWarning)]) + def test_categorical_ordered_none_deprecated(self, none, warning): + # GH 26336: only warn if None is not explicitly passed + cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) + cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none) + + cat = Categorical(list('abcdaba'), dtype=cdt1) + with tm.assert_produces_warning(warning, check_stacklevel=False): + Series(cat, dtype=cdt2) + + s = Series(cat) + with tm.assert_produces_warning(warning, check_stacklevel=False): + Series(s, dtype=cdt2) + def test_categorical_sideeffects_free(self): # Passing a categorical to a Series and then changing values in either # the series or the categorical should not change the values in the diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 59566ad3232c7..287fd15ac3f08 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -8,10 +8,11 @@ from pandas._libs.tslibs import iNaT +from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel + import pandas as pd from pandas import ( Categorical, DataFrame, Index, Series, Timedelta, Timestamp, date_range) -from pandas.api.types import CategoricalDtype import pandas.util.testing as tm @@ -230,6 +231,16 @@ def test_astype_categories_deprecation_raises(self): with pytest.raises(ValueError, match="Got an unexpected"): s.astype('category', categories=['a', 'b'], ordered=True) + @pytest.mark.parametrize('none, warning', [ + (None, None), (ordered_sentinel, FutureWarning)]) + def test_astype_category_ordered_none_deprecated(self, none, warning): + # GH 26336: only warn if None is not explicitly passed + cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) + cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none) + s = Series(list('abcdaba'), dtype=cdt1) + with tm.assert_produces_warning(warning, check_stacklevel=False): + s.astype(cdt2) + def test_astype_from_categorical(self): items = ["a", "b", "c", "a"] s = Series(items)