From 2511d20f698ceb4a5ae7bb4c7a967061dbd164d2 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Mon, 9 Dec 2019 02:10:55 -0700 Subject: [PATCH] DEPR: Change default value for CategoricalDtype.ordered from None to False (#29955) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/arrays/categorical.py | 10 +- pandas/core/construction.py | 2 +- pandas/core/dtypes/dtypes.py | 101 +++++++++--------- pandas/core/series.py | 10 -- .../tests/arrays/categorical/test_dtypes.py | 8 -- pandas/tests/dtypes/test_dtypes.py | 50 ++------- pandas/tests/indexes/test_category.py | 13 +-- pandas/tests/series/test_constructors.py | 18 +--- pandas/tests/series/test_dtypes.py | 13 +-- pandas/tests/series/test_io.py | 9 -- 11 files changed, 67 insertions(+), 168 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 6cdb9221a7a6a1..0d83d74b5e0c15 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -622,6 +622,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. 
- Changed :meth:`Timedelta.resolution` to match the behavior of the standard library ``datetime.timedelta.resolution``, for the old behavior, use :meth:`Timedelta.resolution_string` (:issue:`26839`) - Removed previously deprecated :attr:`Timestamp.weekday_name`, :attr:`DatetimeIndex.weekday_name`, and :attr:`Series.dt.weekday_name` (:issue:`18164`) - Removed previously deprecated ``errors`` argument in :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` (:issue:`22644`) +- Changed the default value for ``ordered`` in :class:`CategoricalDtype` from ``None`` to ``False`` (:issue:`26336`) - :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` now require "labels" as the first argument and "axis" as an optional named parameter (:issue:`30089`) - diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a2e456581cb4fb..1e470e44ed9333 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -328,7 +328,7 @@ def __init__( # sanitize input if is_categorical_dtype(values): if dtype.categories is None: - dtype = CategoricalDtype(values.categories, dtype._ordered) + dtype = CategoricalDtype(values.categories, dtype.ordered) elif not isinstance(values, (ABCIndexClass, ABCSeries)): # sanitize_array coerces np.nan to a string under certain versions # of numpy @@ -351,7 +351,7 @@ def __init__( codes, categories = factorize(values, sort=True) except TypeError: codes, categories = factorize(values, sort=False) - if dtype._ordered: + if dtype.ordered: # raise, as we don't have a sortable data structure and so # the user should give us one by specifying categories raise TypeError( @@ -367,7 +367,7 @@ def __init__( ) # we're inferring from values - dtype = CategoricalDtype(categories, dtype._ordered) + dtype = CategoricalDtype(categories, dtype.ordered) elif is_categorical_dtype(values): old_codes = ( @@ -437,7 +437,7 @@ def ordered(self) -> Ordered: """ Whether the 
categories have an ordered relationship. """ - return self.dtype._ordered + return self.dtype.ordered @property def dtype(self) -> CategoricalDtype: @@ -833,7 +833,7 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal """ inplace = validate_bool_kwarg(inplace, "inplace") if ordered is None: - ordered = self.dtype._ordered + ordered = self.dtype.ordered new_dtype = CategoricalDtype(new_categories, ordered=ordered) cat = self if inplace else self.copy() diff --git a/pandas/core/construction.py b/pandas/core/construction.py index b03c69d8653013..c7dec9e1234d22 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -558,7 +558,7 @@ def _try_cast( # that Categorical is the only array type for 'category'. dtype = cast(CategoricalDtype, dtype) subarr = dtype.construct_array_type()( - arr, dtype.categories, ordered=dtype._ordered + arr, dtype.categories, ordered=dtype.ordered ) elif is_extension_array_dtype(dtype): # create an extension array from its dtype diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 9e16ba670344ee..2bb27de320e7ed 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1,7 +1,6 @@ """ define extension dtypes """ import re from typing import Any, Dict, List, MutableMapping, Optional, Tuple, Type, Union, cast -import warnings import numpy as np import pytz @@ -18,10 +17,6 @@ str_type = str -# GH26403: sentinel value used for the default value of ordered in the -# CategoricalDtype constructor to detect when ordered=None is explicitly passed -ordered_sentinel: object = object() - def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]: """ @@ -179,7 +174,11 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): ---------- categories : sequence, optional Must be unique, and must not contain any nulls. 
- ordered : bool, default False + ordered : bool or None, default False + Whether or not this categorical is treated as an ordered categorical. + None can be used to maintain the ordered value of existing categoricals when + used in operations that combine categoricals, e.g. astype, and will resolve to + False if there is no existing ordered to maintain. Attributes ---------- @@ -218,14 +217,10 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): kind: str_type = "O" str = "|O08" base = np.dtype("O") - _metadata = ("categories", "ordered", "_ordered_from_sentinel") + _metadata = ("categories", "ordered") _cache: Dict[str_type, PandasExtensionDtype] = {} - def __init__( - self, categories=None, ordered: Union[Ordered, object] = ordered_sentinel - ): - # TODO(GH26403): Set type of ordered to Ordered - ordered = cast(Ordered, ordered) + def __init__(self, categories=None, ordered: Ordered = False): self._finalize(categories, ordered, fastpath=False) @classmethod @@ -338,17 +333,45 @@ def _from_values_or_dtype( return dtype + @classmethod + def construct_from_string(cls, string: str_type) -> "CategoricalDtype": + """ + Construct a CategoricalDtype from a string. + + Parameters + ---------- + string : str + Must be the string "category" in order to be successfully constructed. + + Returns + ------- + CategoricalDtype + Instance of the dtype. + + Raises + ------ + TypeError + If a CategoricalDtype cannot be constructed from the input. 
+ """ + if not isinstance(string, str): + raise TypeError(f"Expects a string, got {type(string)}") + if string != cls.name: + raise TypeError(f"Cannot construct a 'CategoricalDtype' from '{string}'") + + # need ordered=None to ensure that operations specifying dtype="category" don't + # override the ordered value for existing categoricals + return cls(ordered=None) + def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None: - if ordered is not None and ordered is not ordered_sentinel: + if ordered is not None: self.validate_ordered(ordered) if categories is not None: categories = self.validate_categories(categories, fastpath=fastpath) self._categories = categories - self._ordered = ordered if ordered is not ordered_sentinel else None - self._ordered_from_sentinel = ordered is ordered_sentinel + self._ordered = ordered def __setstate__(self, state: MutableMapping[str_type, Any]) -> None: # for pickle compat. __get_state__ is defined in the @@ -356,18 +379,17 @@ def __setstate__(self, state: MutableMapping[str_type, Any]) -> None: # pickle -> need to set the settable private ones here (see GH26067) self._categories = state.pop("categories", None) self._ordered = state.pop("ordered", False) - self._ordered_from_sentinel = state.pop("_ordered_from_sentinel", False) def __hash__(self) -> int: # _hash_categories returns a uint64, so use the negative # space for when we have unknown categories to avoid a conflict if self.categories is None: - if self._ordered: + if self.ordered: return -1 else: return -2 # We *do* want to include the real self.ordered here - return int(self._hash_categories(self.categories, self._ordered)) + return int(self._hash_categories(self.categories, self.ordered)) def __eq__(self, other: Any) -> bool: """ @@ -386,7 +408,7 @@ def __eq__(self, other: Any) -> bool: return other == self.name elif other is self: return True - elif not (hasattr(other, "_ordered") and hasattr(other, "categories")): + elif not (hasattr(other, 
"ordered") and hasattr(other, "categories")): return False elif self.categories is None or other.categories is None: # We're forced into a suboptimal corner thanks to math and @@ -395,10 +417,10 @@ def __eq__(self, other: Any) -> bool: # CDT(., .) = CDT(None, False) and *all* # CDT(., .) = CDT(None, True). return True - elif self._ordered or other._ordered: + elif self.ordered or other.ordered: # At least one has ordered=True; equal if both have ordered=True # and the same values for categories in the same order. - return (self._ordered == other._ordered) and self.categories.equals( + return (self.ordered == other.ordered) and self.categories.equals( other.categories ) else: @@ -420,7 +442,7 @@ def __repr__(self) -> str_type: data = "None, " else: data = self.categories._format_data(name=type(self).__name__) - return tpl.format(data=data, ordered=self._ordered) + return tpl.format(data=data, ordered=self.ordered) @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: @@ -557,26 +579,11 @@ def update_dtype( # from here on, dtype is a CategoricalDtype dtype = cast(CategoricalDtype, dtype) - # dtype is CDT: keep current categories/ordered if None - new_categories = dtype.categories - if new_categories is None: - new_categories = self.categories - - new_ordered = dtype._ordered - new_ordered_from_sentinel = dtype._ordered_from_sentinel - if new_ordered is None: - # maintain existing ordered if new dtype has ordered=None - new_ordered = self._ordered - if self._ordered and new_ordered_from_sentinel: - # only warn if we'd actually change the existing behavior - msg = ( - "Constructing a CategoricalDtype without specifying " - "`ordered` will default to `ordered=False` in a future " - "version, which will cause the resulting categorical's " - "`ordered` attribute to change to False; `ordered=True` " - "must be explicitly passed in order to be retained" - ) - warnings.warn(msg, FutureWarning, stacklevel=3) + # update categories/ordered unless 
they've been explicitly passed as None + new_categories = ( + dtype.categories if dtype.categories is not None else self.categories + ) + new_ordered = dtype.ordered if dtype.ordered is not None else self.ordered return CategoricalDtype(new_categories, new_ordered) @@ -592,16 +599,6 @@ def ordered(self) -> Ordered: """ Whether the categories have an ordered relationship. """ - # TODO: remove if block when ordered=None as default is deprecated - if self._ordered_from_sentinel and self._ordered is None: - # warn when accessing ordered if ordered=None and None was not - # explicitly passed to the constructor - msg = ( - "Constructing a CategoricalDtype without specifying " - "`ordered` will default to `ordered=False` in a future " - "version; `ordered=None` must be explicitly passed." - ) - warnings.warn(msg, FutureWarning, stacklevel=2) return self._ordered @property diff --git a/pandas/core/series.py b/pandas/core/series.py index 9d75025ebcb1a1..965736a097c218 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -20,7 +20,6 @@ _is_unorderable_exception, ensure_platform_int, is_bool, - is_categorical, is_categorical_dtype, is_datetime64_dtype, is_dict_like, @@ -213,15 +212,6 @@ def __init__( if data is None: data = {} if dtype is not None: - # GH 26336: explicitly handle 'category' to avoid warning - # TODO: Remove after CategoricalDtype defaults to ordered=False - if ( - isinstance(dtype, str) - and dtype == "category" - and is_categorical(data) - ): - dtype = data.dtype - dtype = self._validate_dtype(dtype) if isinstance(data, MultiIndex): diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index c08ad1da386718..85bf385b029a36 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -161,14 +161,6 @@ def test_astype_category(self, dtype_ordered, cat_ordered): expected = cat tm.assert_categorical_equal(result, expected) - def 
test_astype_category_ordered_none_deprecated(self): - # GH 26336 - cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) - cdt2 = CategoricalDtype(categories=list("cedafb")) - cat = Categorical(list("abcdaba"), dtype=cdt1) - with tm.assert_produces_warning(FutureWarning): - cat.astype(cdt2) - def test_iter_python_types(self): # GH-19909 cat = Categorical([1, 2]) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index bbf44006611fba..6c6ff3272c0125 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -22,7 +22,6 @@ DatetimeTZDtype, IntervalDtype, PeriodDtype, - ordered_sentinel, registry, ) @@ -65,8 +64,7 @@ def test_pickle(self): class TestCategoricalDtype(Base): def create(self): - # TODO(GH 26403): Remove when default ordered becomes False - return CategoricalDtype(ordered=None) + return CategoricalDtype() def test_pickle(self): # make sure our cache is NOT pickled @@ -721,8 +719,7 @@ def test_unordered_same(self, ordered): def test_categories(self): result = CategoricalDtype(["a", "b", "c"]) tm.assert_index_equal(result.categories, pd.Index(["a", "b", "c"])) - with tm.assert_produces_warning(FutureWarning): - assert result.ordered is None + assert result.ordered is False def test_equal_but_different(self, ordered_fixture): c1 = CategoricalDtype([1, 2, 3]) @@ -847,25 +844,15 @@ def test_categorical_categories(self): @pytest.mark.parametrize( "new_categories", [list("abc"), list("cba"), list("wxyz"), None] ) - @pytest.mark.parametrize("new_ordered", [True, False, None, ordered_sentinel]) + @pytest.mark.parametrize("new_ordered", [True, False, None]) def test_update_dtype(self, ordered_fixture, new_categories, new_ordered): - dtype = CategoricalDtype(list("abc"), ordered_fixture) + original_categories = list("abc") + dtype = CategoricalDtype(original_categories, ordered_fixture) new_dtype = CategoricalDtype(new_categories, new_ordered) - expected_categories = new_dtype.categories - 
if expected_categories is None: - expected_categories = dtype.categories - - expected_ordered = new_ordered - if new_ordered is ordered_sentinel or new_ordered is None: - expected_ordered = dtype.ordered - - # GH 26336 - if new_ordered is ordered_sentinel and ordered_fixture is True: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = dtype.update_dtype(new_dtype) - else: - result = dtype.update_dtype(new_dtype) + result = dtype.update_dtype(new_dtype) + expected_categories = pd.Index(new_categories or original_categories) + expected_ordered = new_ordered if new_ordered is not None else dtype.ordered tm.assert_index_equal(result.categories, expected_categories) assert result.ordered is expected_ordered @@ -885,27 +872,6 @@ def test_update_dtype_errors(self, bad_dtype): with pytest.raises(ValueError, match=msg): dtype.update_dtype(bad_dtype) - @pytest.mark.parametrize("ordered", [ordered_sentinel, None, True, False]) - def test_ordered_none_default_deprecated(self, ordered): - # GH 26403: CDT.ordered only warns if ordered is not explicitly passed - dtype = CategoricalDtype(list("abc"), ordered=ordered) - warning = FutureWarning if ordered is ordered_sentinel else None - with tm.assert_produces_warning(warning): - dtype.ordered - - @pytest.mark.parametrize("ordered", [True, False, None, ordered_sentinel]) - def test_pickle_ordered_from_sentinel(self, ordered): - # GH 27295: can remove test when _ordered_from_sentinel is removed (GH 26403) - dtype = CategoricalDtype(categories=list("abc"), ordered=ordered) - - warning = FutureWarning if ordered is ordered_sentinel else None - with tm.assert_produces_warning(warning, check_stacklevel=False): - dtype_from_pickle = tm.round_trip_pickle(dtype) - - result = dtype_from_pickle._ordered_from_sentinel - expected = ordered is ordered_sentinel - assert result is expected - @pytest.mark.parametrize( "dtype", [CategoricalDtype, IntervalDtype, DatetimeTZDtype, PeriodDtype] diff --git 
a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 86219d77542af6..7286fca42848c4 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -5,7 +5,7 @@ from pandas._libs import index as libindex -from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel +from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd from pandas import Categorical, IntervalIndex @@ -525,17 +525,6 @@ def test_astype_category(self, name, dtype_ordered, index_ordered): expected = index tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( - "none, warning", [(None, None), (ordered_sentinel, FutureWarning)] - ) - def test_astype_category_ordered_none_deprecated(self, none, warning): - # GH 26336: only warn if None is not explicitly passed - cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) - cdt2 = CategoricalDtype(categories=list("cedafb"), ordered=none) - idx = CategoricalIndex(list("abcdaba"), dtype=cdt1) - with tm.assert_produces_warning(warning): - idx.astype(cdt2) - def test_reindex_base(self): # Determined by cat ordering. 
idx = CategoricalIndex(list("cab"), categories=list("cab")) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 293ec9580436ef..c772038619db0a 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -9,7 +9,7 @@ from pandas._libs.tslib import iNaT from pandas.core.dtypes.common import is_categorical_dtype, is_datetime64tz_dtype -from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel +from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd from pandas import ( @@ -401,22 +401,6 @@ def test_constructor_categorical_string(self): result = Series(result, dtype="category") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "none, warning", [(None, None), (ordered_sentinel, FutureWarning)] - ) - def test_categorical_ordered_none_deprecated(self, none, warning): - # GH 26336: only warn if None is not explicitly passed - cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) - cdt2 = CategoricalDtype(categories=list("cedafb"), ordered=none) - - cat = Categorical(list("abcdaba"), dtype=cdt1) - with tm.assert_produces_warning(warning, check_stacklevel=False): - Series(cat, dtype=cdt2) - - s = Series(cat) - with tm.assert_produces_warning(warning, check_stacklevel=False): - Series(s, dtype=cdt2) - def test_categorical_sideeffects_free(self): # Passing a categorical to a Series and then changing values in either # the series or the categorical should not change the values in the diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 22b00425abb6b4..ff4842791b4fd6 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -8,7 +8,7 @@ from pandas._libs.tslibs import iNaT -from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel +from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd from pandas import ( @@ -219,17 
+219,6 @@ def test_astype_categories_raises(self): with pytest.raises(TypeError, match="got an unexpected"): s.astype("category", categories=["a", "b"], ordered=True) - @pytest.mark.parametrize( - "none, warning", [(None, None), (ordered_sentinel, FutureWarning)] - ) - def test_astype_category_ordered_none_deprecated(self, none, warning): - # GH 26336: only warn if None is not explicitly passed - cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) - cdt2 = CategoricalDtype(categories=list("cedafb"), ordered=none) - s = Series(list("abcdaba"), dtype=cdt1) - with tm.assert_produces_warning(warning, check_stacklevel=False): - s.astype(cdt2) - def test_astype_from_categorical(self): items = ["a", "b", "c", "a"] s = Series(items) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index cd32b2188b892a..9041d582b19ca0 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -216,15 +216,6 @@ def test_pickle_preserve_name(self): unpickled = self._pickle_roundtrip_name(tm.makeTimeSeries(name=n)) assert unpickled.name == n - def test_pickle_categorical_ordered_from_sentinel(self): - # GH 27295: can remove test when _ordered_from_sentinel is removed (GH 26403) - s = Series(["a", "b", "c", "a"], dtype="category") - result = tm.round_trip_pickle(s) - result = result.astype("category") - - tm.assert_series_equal(result, s) - assert result.dtype._ordered_from_sentinel is False - def _pickle_roundtrip_name(self, obj): with tm.ensure_clean() as path: