Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: Deprecate ordered=None for CategoricalDtype #26403

Merged
merged 14 commits into from
Jul 3, 2019
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,7 @@ Other deprecations
- :attr:`Series.imag` and :attr:`Series.real` are deprecated. (:issue:`18262`)
- :meth:`Series.put` is deprecated. (:issue:`18262`)
- :meth:`Index.item` and :meth:`Series.item` is deprecated. (:issue:`18262`)
- The default value ``ordered=None`` in :class:`~pandas.api.types.CategoricalDtype` has been deprecated in favor of ``ordered=False``. When converting between categorical types ``ordered=True`` must be explicitly passed in order to be preserved. (:issue:`26336`)

.. _whatsnew_0250.prior_deprecations:

Expand Down
10 changes: 5 additions & 5 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
# sanitize input
if is_categorical_dtype(values):
if dtype.categories is None:
dtype = CategoricalDtype(values.categories, dtype.ordered)
dtype = CategoricalDtype(values.categories, dtype._ordered)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we really need to user the internal one here? (and all others); I really don't want to expose that even to our code.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See the overview that Jeremy gave above (#26403 (comment)) and this comment for more details on why _ordered was needed: #26403 (comment)
That explains clearly the context on why this was added, and I am fine with it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok that's fine then, @jschendel can you create an issue for this so we don't forget at removal time.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, back at work today so will create the issue later on tonight

elif not isinstance(values, (ABCIndexClass, ABCSeries)):
# sanitize_array coerces np.nan to a string under certain versions
# of numpy
Expand All @@ -354,7 +354,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
codes, categories = factorize(values, sort=True)
except TypeError:
codes, categories = factorize(values, sort=False)
if dtype.ordered:
if dtype._ordered:
# raise, as we don't have a sortable data structure and so
# the user should give us one by specifying categories
raise TypeError("'values' is not ordered, please "
Expand All @@ -367,7 +367,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
"supported at this time")

# we're inferring from values
dtype = CategoricalDtype(categories, dtype.ordered)
dtype = CategoricalDtype(categories, dtype._ordered)

elif is_categorical_dtype(values):
old_codes = (values._values.codes if isinstance(values, ABCSeries)
Expand Down Expand Up @@ -432,7 +432,7 @@ def ordered(self):
"""
Whether the categories have an ordered relationship.
"""
return self.dtype.ordered
return self.dtype._ordered

@property
def dtype(self) -> CategoricalDtype:
Expand Down Expand Up @@ -846,7 +846,7 @@ def set_categories(self, new_categories, ordered=None, rename=False,
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
if ordered is None:
ordered = self.dtype.ordered
ordered = self.dtype._ordered
new_dtype = CategoricalDtype(new_categories, ordered=ordered)

cat = self if inplace else self.copy()
Expand Down
48 changes: 35 additions & 13 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@

str_type = str

# sentinel value used for the default value of ordered in the CategoricalDtype
# constructor to detect when ordered=None is explicitly passed (GH 26403)
sentinel = object() # type: object
jschendel marked this conversation as resolved.
Show resolved Hide resolved


def register_extension_dtype(cls: Type[ExtensionDtype],
) -> Type[ExtensionDtype]:
Expand Down Expand Up @@ -214,7 +218,9 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
_metadata = ('categories', 'ordered')
_cache = {} # type: Dict[str_type, PandasExtensionDtype]

def __init__(self, categories=None, ordered: Optional[bool] = None):
def __init__(self,
categories=None,
ordered: Union[None, bool, object] = sentinel):
self._finalize(categories, ordered, fastpath=False)

@classmethod
Expand Down Expand Up @@ -334,15 +340,16 @@ def _finalize(self,
fastpath: bool = False,
) -> None:

if ordered is not None:
if ordered is not None and ordered is not sentinel:
self.validate_ordered(ordered)

if categories is not None:
categories = self.validate_categories(categories,
fastpath=fastpath)

self._categories = categories
self._ordered = ordered
self._ordered = ordered if ordered is not sentinel else None
self._ordered_from_sentinel = ordered is sentinel
Copy link
Member Author

@jschendel jschendel Jul 1, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a little bit gross but seemed to be the cleanest way to implement things in a non-intrusive way (i.e. only warning when behavior would silently change).

Internal code can't reference .ordered because it could trigger the warning due to under the hood operations that the user has no control over (e.g. handling 'category' as dtype), so internal code needs to reference ._ordered. Instead of special casing sentinel every time ._ordered is referenced seemed cleaner to add an additional attribute to keep track of things.

I tried to change the internals so that .ordered didn't trigger the warning, but I wasn't successfully able to do so, and my failing attempt had already gotten more complex than the .ordered --> ._ordered change, so didn't seem worthwhile to pursue further.

Note that the .ordered --> ._ordered change only applies to CategoricalDtype; this change does not appear to be needed to Categorical or CategoricalIndex, as those will always resolve to a boolean.

I'll volunteer to remove this and revert _ordered to ordered when we go through with changing the default value in the constructor.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll volunteer to remove this and revert _ordered to ordered when we go through with changing the default value in the constructor.

Showtime!


def __setstate__(self, state: Dict[str_type, Any]) -> None:
# for pickle compat. __get_state__ is defined in the
Expand All @@ -355,12 +362,12 @@ def __hash__(self) -> int:
# _hash_categories returns a uint64, so use the negative
# space for when we have unknown categories to avoid a conflict
if self.categories is None:
if self.ordered:
if self._ordered:
return -1
else:
return -2
# We *do* want to include the real self.ordered here
return int(self._hash_categories(self.categories, self.ordered))
return int(self._hash_categories(self.categories, self._ordered))

def __eq__(self, other: Any) -> bool:
"""
Expand All @@ -379,7 +386,7 @@ def __eq__(self, other: Any) -> bool:
return other == self.name
elif other is self:
return True
elif not (hasattr(other, 'ordered') and hasattr(other, 'categories')):
elif not (hasattr(other, '_ordered') and hasattr(other, 'categories')):
return False
elif self.categories is None or other.categories is None:
# We're forced into a suboptimal corner thanks to math and
Expand All @@ -388,10 +395,10 @@ def __eq__(self, other: Any) -> bool:
# CDT(., .) = CDT(None, False) and *all*
# CDT(., .) = CDT(None, True).
return True
elif self.ordered or other.ordered:
elif self._ordered or other._ordered:
# At least one has ordered=True; equal if both have ordered=True
# and the same values for categories in the same order.
return ((self.ordered == other.ordered) and
return ((self._ordered == other._ordered) and
self.categories.equals(other.categories))
else:
# Neither has ordered=True; equal if both have the same categories,
Expand All @@ -406,7 +413,7 @@ def __repr__(self):
data = "None, "
else:
data = self.categories._format_data(name=self.__class__.__name__)
return tpl.format(data, self.ordered)
return tpl.format(data, self._ordered)

@staticmethod
def _hash_categories(categories, ordered: Optional[bool] = True) -> int:
Expand Down Expand Up @@ -534,17 +541,24 @@ def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype':
msg = ('a CategoricalDtype must be passed to perform an update, '
'got {dtype!r}').format(dtype=dtype)
raise ValueError(msg)
elif dtype.categories is not None and dtype.ordered is self.ordered:
return dtype

# dtype is CDT: keep current categories/ordered if None
new_categories = dtype.categories
if new_categories is None:
new_categories = self.categories

new_ordered = dtype.ordered
new_ordered = dtype._ordered
new_ordered_from_sentinel = dtype._ordered_from_sentinel
if new_ordered is None:
new_ordered = self.ordered
# maintain existing ordered if new dtype has ordered=None
new_ordered = self._ordered
if self._ordered and new_ordered_from_sentinel:
# only warn if we'd actually change the existing behavior
msg = ("Constructing a CategoricalDtype without specifying "
"`ordered` will default to `ordered=False` in a future "
"version; `ordered=True` must be explicitly passed in "
"order to be retained")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we explain what the use case is here? (maybe we don't really want to encourage using it ..)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added some more detail to the message, but agree that we don't really want to encourage using ordered=None so I didn't specifically mention it

warnings.warn(msg, FutureWarning, stacklevel=3)

return CategoricalDtype(new_categories, new_ordered)

Expand All @@ -560,6 +574,14 @@ def ordered(self) -> Optional[bool]:
"""
Whether the categories have an ordered relationship.
"""
# TODO: remove if block when ordered=None as default is deprecated
if self._ordered_from_sentinel and self._ordered is None:
# warn when accessing ordered if ordered=None and None was not
# explicitly passed to the constructor
msg = ("Constructing a CategoricalDtype without specifying "
"`ordered` will default to `ordered=False` in a future "
"version; `ordered=None` must be explicitly passed.")
warnings.warn(msg, FutureWarning, stacklevel=2)
return self._ordered

@property
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,7 @@ def _try_cast(arr, dtype, copy, raise_cast_failure):
# We *do* allow casting to categorical, since we know
# that Categorical is the only array type for 'category'.
subarr = Categorical(arr, dtype.categories,
ordered=dtype.ordered)
ordered=dtype._ordered)
elif is_extension_array_dtype(dtype):
# create an extension array from its dtype
array_type = dtype.construct_array_type()._from_sequence
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.common import (
_is_unorderable_exception, ensure_platform_int, is_bool,
_is_unorderable_exception, ensure_platform_int, is_bool, is_categorical,
is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like,
is_extension_array_dtype, is_extension_type, is_hashable, is_integer,
is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype)
Expand Down Expand Up @@ -169,6 +169,12 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
if data is None:
data = {}
if dtype is not None:
# GH 26336: explicitly handle 'category' to avoid warning
# TODO: Remove after CategoricalDtype defaults to ordered=False
if (isinstance(dtype, str) and dtype == 'category' and
is_categorical(data)):
dtype = data.dtype

dtype = self._validate_dtype(dtype)

if isinstance(data, MultiIndex):
Expand Down
9 changes: 2 additions & 7 deletions pandas/io/packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,14 +618,9 @@ def decode(obj):
return Interval(obj['left'], obj['right'], obj['closed'])
elif typ == 'series':
dtype = dtype_for(obj['dtype'])
pd_dtype = pandas_dtype(dtype)

index = obj['index']
result = Series(unconvert(obj['data'], dtype, obj['compress']),
index=index,
dtype=pd_dtype,
name=obj['name'])
return result
data = unconvert(obj['data'], dtype, obj['compress'])
return Series(data, index=index, dtype=dtype, name=obj['name'])

elif typ == 'block_manager':
axes = obj['axes']
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/arrays/categorical/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,14 @@ def test_astype_category(self, dtype_ordered, cat_ordered):
expected = cat
tm.assert_categorical_equal(result, expected)

def test_astype_category_ordered_none_deprecated(self):
# GH 26336
cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True)
cdt2 = CategoricalDtype(categories=list('cedafb'))
cat = Categorical(list('abcdaba'), dtype=cdt1)
with tm.assert_produces_warning(FutureWarning):
cat.astype(cdt2)

def test_iter_python_types(self):
# GH-19909
cat = Categorical([1, 2])
Expand Down
32 changes: 25 additions & 7 deletions pandas/tests/dtypes/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
is_datetime64tz_dtype, is_datetimetz, is_dtype_equal, is_interval_dtype,
is_period, is_period_dtype, is_string_dtype)
from pandas.core.dtypes.dtypes import (
CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, registry)
CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, registry,
sentinel)

import pandas as pd
from pandas import (
Expand Down Expand Up @@ -54,7 +55,8 @@ def test_pickle(self):
class TestCategoricalDtype(Base):

def create(self):
return CategoricalDtype()
# TODO(GH 26403): Remove when default ordered becomes False
return CategoricalDtype(ordered=None)
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved

def test_pickle(self):
# make sure our cache is NOT pickled
Expand Down Expand Up @@ -675,7 +677,8 @@ def test_unordered_same(self, ordered):
def test_categories(self):
result = CategoricalDtype(['a', 'b', 'c'])
tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c']))
assert result.ordered is None
with tm.assert_produces_warning(FutureWarning):
assert result.ordered is None

def test_equal_but_different(self, ordered_fixture):
c1 = CategoricalDtype([1, 2, 3])
Expand Down Expand Up @@ -804,7 +807,7 @@ def test_categorical_categories(self):

@pytest.mark.parametrize('new_categories', [
list('abc'), list('cba'), list('wxyz'), None])
@pytest.mark.parametrize('new_ordered', [True, False, None])
@pytest.mark.parametrize('new_ordered', [True, False, None, sentinel])
def test_update_dtype(self, ordered_fixture, new_categories, new_ordered):
dtype = CategoricalDtype(list('abc'), ordered_fixture)
new_dtype = CategoricalDtype(new_categories, new_ordered)
Expand All @@ -813,11 +816,18 @@ def test_update_dtype(self, ordered_fixture, new_categories, new_ordered):
if expected_categories is None:
expected_categories = dtype.categories

expected_ordered = new_dtype.ordered
if expected_ordered is None:
expected_ordered = new_ordered
if new_ordered is sentinel or new_ordered is None:
expected_ordered = dtype.ordered

result = dtype.update_dtype(new_dtype)
# GH 26336
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you give some explanation on what you are testing here (the cases)

if new_ordered is sentinel and ordered_fixture is True:
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = dtype.update_dtype(new_dtype)
else:
result = dtype.update_dtype(new_dtype)

tm.assert_index_equal(result.categories, expected_categories)
assert result.ordered is expected_ordered

Expand All @@ -837,6 +847,14 @@ def test_update_dtype_errors(self, bad_dtype):
with pytest.raises(ValueError, match=msg):
dtype.update_dtype(bad_dtype)

@pytest.mark.parametrize('ordered', [sentinel, None, True, False])
def test_ordered_none_default_deprecated(self, ordered):
# GH 26403: CDT.ordered only warns if ordered is not explicitly passed
dtype = CategoricalDtype(list('abc'), ordered=ordered)
warning = FutureWarning if ordered is sentinel else None
with tm.assert_produces_warning(warning):
dtype.ordered


@pytest.mark.parametrize('dtype', [
CategoricalDtype,
Expand Down
12 changes: 11 additions & 1 deletion pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from pandas._libs import index as libindex

from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.dtypes import CategoricalDtype, sentinel

import pandas as pd
from pandas import Categorical, IntervalIndex
Expand Down Expand Up @@ -490,6 +490,16 @@ def test_astype_category(self, name, dtype_ordered, index_ordered):
expected = index
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize('none, warning', [
(None, None), (sentinel, FutureWarning)])
def test_astype_category_ordered_none_deprecated(self, none, warning):
# GH 26336: only warn if None is not explicitly passed
cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True)
cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none)
idx = CategoricalIndex(list('abcdaba'), dtype=cdt1)
with tm.assert_produces_warning(warning):
idx.astype(cdt2)

def test_reindex_base(self):
# Determined by cat ordering.
idx = CategoricalIndex(list("cab"), categories=list("cab"))
Expand Down
31 changes: 30 additions & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@

from pandas.core.dtypes.common import (
is_categorical_dtype, is_datetime64tz_dtype)
from pandas.core.dtypes.dtypes import CategoricalDtype, sentinel

import pandas as pd
from pandas import (
Categorical, DataFrame, Index, IntervalIndex, MultiIndex, NaT, Series,
Timestamp, date_range, isna, period_range, timedelta_range)
from pandas.api.types import CategoricalDtype
from pandas.core.arrays import period_array
import pandas.util.testing as tm
from pandas.util.testing import assert_series_equal
Expand Down Expand Up @@ -372,6 +372,35 @@ def test_constructor_categorical_dtype(self):
dtype=CategoricalDtype(['a', 'b'], ordered=True))
tm.assert_series_equal(result, expected, check_categorical=True)

def test_constructor_categorical_string(self):
# GH 26336: the string 'category' maintains existing CategoricalDtype
cdt = CategoricalDtype(categories=list('dabc'), ordered=True)
expected = Series(list('abcabc'), dtype=cdt)

# Series(Categorical, dtype='category') keeps existing dtype
cat = Categorical(list('abcabc'), dtype=cdt)
result = Series(cat, dtype='category')
tm.assert_series_equal(result, expected)

# Series(Series[Categorical], dtype='category') keeps existing dtype
result = Series(result, dtype='category')
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize('none, warning', [
(None, None), (sentinel, FutureWarning)])
def test_categorical_ordered_none_deprecated(self, none, warning):
# GH 26336: only warn if None is not explicitly passed
cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True)
cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none)

cat = Categorical(list('abcdaba'), dtype=cdt1)
with tm.assert_produces_warning(warning, check_stacklevel=False):
Series(cat, dtype=cdt2)

s = Series(cat)
with tm.assert_produces_warning(warning, check_stacklevel=False):
Series(s, dtype=cdt2)

def test_categorical_sideeffects_free(self):
# Passing a categorical to a Series and then changing values in either
# the series or the categorical should not change the values in the
Expand Down
Loading