-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DEPR: Deprecate ordered=None for CategoricalDtype #26403
Changes from 9 commits
adc0bca
4ad16c9
2da967d
b03eadd
77f171d
cf12a1f
060eb08
c9c2333
bc16e98
99b6a30
d999bd8
fdb5770
c45f159
219f03c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,10 @@ | |
|
||
str_type = str | ||
|
||
# sentinel value used for the default value of ordered in the CategoricalDtype | ||
# constructor to detect when ordered=None is explicitly passed (GH 26403) | ||
sentinel = object() # type: object | ||
jschendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
def register_extension_dtype(cls: Type[ExtensionDtype], | ||
) -> Type[ExtensionDtype]: | ||
|
@@ -214,7 +218,9 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): | |
_metadata = ('categories', 'ordered') | ||
_cache = {} # type: Dict[str_type, PandasExtensionDtype] | ||
|
||
def __init__(self, categories=None, ordered: Optional[bool] = None): | ||
def __init__(self, | ||
categories=None, | ||
ordered: Union[None, bool, object] = sentinel): | ||
self._finalize(categories, ordered, fastpath=False) | ||
|
||
@classmethod | ||
|
@@ -334,15 +340,16 @@ def _finalize(self, | |
fastpath: bool = False, | ||
) -> None: | ||
|
||
if ordered is not None: | ||
if ordered is not None and ordered is not sentinel: | ||
self.validate_ordered(ordered) | ||
|
||
if categories is not None: | ||
categories = self.validate_categories(categories, | ||
fastpath=fastpath) | ||
|
||
self._categories = categories | ||
self._ordered = ordered | ||
self._ordered = ordered if ordered is not sentinel else None | ||
self._ordered_from_sentinel = ordered is sentinel | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a little bit gross but seemed to be the cleanest way to implement things in a non-intrusive way (i.e. only warning when behavior would silently change). Internal code can't reference I tried to change the internals so that Note that the I'll volunteer to remove this and revert There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Showtime! |
||
|
||
def __setstate__(self, state: Dict[str_type, Any]) -> None: | ||
# for pickle compat. __get_state__ is defined in the | ||
|
@@ -355,12 +362,12 @@ def __hash__(self) -> int: | |
# _hash_categories returns a uint64, so use the negative | ||
# space for when we have unknown categories to avoid a conflict | ||
if self.categories is None: | ||
if self.ordered: | ||
if self._ordered: | ||
return -1 | ||
else: | ||
return -2 | ||
# We *do* want to include the real self.ordered here | ||
return int(self._hash_categories(self.categories, self.ordered)) | ||
return int(self._hash_categories(self.categories, self._ordered)) | ||
|
||
def __eq__(self, other: Any) -> bool: | ||
""" | ||
|
@@ -379,7 +386,7 @@ def __eq__(self, other: Any) -> bool: | |
return other == self.name | ||
elif other is self: | ||
return True | ||
elif not (hasattr(other, 'ordered') and hasattr(other, 'categories')): | ||
elif not (hasattr(other, '_ordered') and hasattr(other, 'categories')): | ||
return False | ||
elif self.categories is None or other.categories is None: | ||
# We're forced into a suboptimal corner thanks to math and | ||
|
@@ -388,10 +395,10 @@ def __eq__(self, other: Any) -> bool: | |
# CDT(., .) = CDT(None, False) and *all* | ||
# CDT(., .) = CDT(None, True). | ||
return True | ||
elif self.ordered or other.ordered: | ||
elif self._ordered or other._ordered: | ||
# At least one has ordered=True; equal if both have ordered=True | ||
# and the same values for categories in the same order. | ||
return ((self.ordered == other.ordered) and | ||
return ((self._ordered == other._ordered) and | ||
self.categories.equals(other.categories)) | ||
else: | ||
# Neither has ordered=True; equal if both have the same categories, | ||
|
@@ -406,7 +413,7 @@ def __repr__(self): | |
data = "None, " | ||
else: | ||
data = self.categories._format_data(name=self.__class__.__name__) | ||
return tpl.format(data, self.ordered) | ||
return tpl.format(data, self._ordered) | ||
|
||
@staticmethod | ||
def _hash_categories(categories, ordered: Optional[bool] = True) -> int: | ||
|
@@ -534,17 +541,24 @@ def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype': | |
msg = ('a CategoricalDtype must be passed to perform an update, ' | ||
'got {dtype!r}').format(dtype=dtype) | ||
raise ValueError(msg) | ||
elif dtype.categories is not None and dtype.ordered is self.ordered: | ||
return dtype | ||
|
||
# dtype is CDT: keep current categories/ordered if None | ||
new_categories = dtype.categories | ||
if new_categories is None: | ||
new_categories = self.categories | ||
|
||
new_ordered = dtype.ordered | ||
new_ordered = dtype._ordered | ||
new_ordered_from_sentinel = dtype._ordered_from_sentinel | ||
if new_ordered is None: | ||
new_ordered = self.ordered | ||
# maintain existing ordered if new dtype has ordered=None | ||
new_ordered = self._ordered | ||
if self._ordered and new_ordered_from_sentinel: | ||
# only warn if we'd actually change the existing behavior | ||
msg = ("Constructing a CategoricalDtype without specifying " | ||
"`ordered` will default to `ordered=False` in a future " | ||
"version; `ordered=True` must be explicitly passed in " | ||
"order to be retained") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we explain what the use case is here? (maybe we don't really want to encourage using it ..) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added some more detail to the message, but agree that we don't really want to encourage using |
||
warnings.warn(msg, FutureWarning, stacklevel=3) | ||
|
||
return CategoricalDtype(new_categories, new_ordered) | ||
|
||
|
@@ -560,6 +574,14 @@ def ordered(self) -> Optional[bool]: | |
""" | ||
Whether the categories have an ordered relationship. | ||
""" | ||
# TODO: remove if block when ordered=None as default is deprecated | ||
if self._ordered_from_sentinel and self._ordered is None: | ||
# warn when accessing ordered if ordered=None and None was not | ||
# explicitly passed to the constructor | ||
msg = ("Constructing a CategoricalDtype without specifying " | ||
"`ordered` will default to `ordered=False` in a future " | ||
"version; `ordered=None` must be explicitly passed.") | ||
warnings.warn(msg, FutureWarning, stacklevel=2) | ||
return self._ordered | ||
|
||
@property | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,8 @@ | |
is_datetime64tz_dtype, is_datetimetz, is_dtype_equal, is_interval_dtype, | ||
is_period, is_period_dtype, is_string_dtype) | ||
from pandas.core.dtypes.dtypes import ( | ||
CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, registry) | ||
CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, registry, | ||
sentinel) | ||
|
||
import pandas as pd | ||
from pandas import ( | ||
|
@@ -54,7 +55,8 @@ def test_pickle(self): | |
class TestCategoricalDtype(Base): | ||
|
||
def create(self): | ||
return CategoricalDtype() | ||
# TODO(GH 26403): Remove when default ordered becomes False | ||
return CategoricalDtype(ordered=None) | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def test_pickle(self): | ||
# make sure our cache is NOT pickled | ||
|
@@ -675,7 +677,8 @@ def test_unordered_same(self, ordered): | |
def test_categories(self): | ||
result = CategoricalDtype(['a', 'b', 'c']) | ||
tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c'])) | ||
assert result.ordered is None | ||
with tm.assert_produces_warning(FutureWarning): | ||
assert result.ordered is None | ||
|
||
def test_equal_but_different(self, ordered_fixture): | ||
c1 = CategoricalDtype([1, 2, 3]) | ||
|
@@ -804,7 +807,7 @@ def test_categorical_categories(self): | |
|
||
@pytest.mark.parametrize('new_categories', [ | ||
list('abc'), list('cba'), list('wxyz'), None]) | ||
@pytest.mark.parametrize('new_ordered', [True, False, None]) | ||
@pytest.mark.parametrize('new_ordered', [True, False, None, sentinel]) | ||
def test_update_dtype(self, ordered_fixture, new_categories, new_ordered): | ||
dtype = CategoricalDtype(list('abc'), ordered_fixture) | ||
new_dtype = CategoricalDtype(new_categories, new_ordered) | ||
|
@@ -813,11 +816,18 @@ def test_update_dtype(self, ordered_fixture, new_categories, new_ordered): | |
if expected_categories is None: | ||
expected_categories = dtype.categories | ||
|
||
expected_ordered = new_dtype.ordered | ||
if expected_ordered is None: | ||
expected_ordered = new_ordered | ||
if new_ordered is sentinel or new_ordered is None: | ||
expected_ordered = dtype.ordered | ||
|
||
result = dtype.update_dtype(new_dtype) | ||
# GH 26336 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you give some explanation on what you are testing here (the cases) |
||
if new_ordered is sentinel and ordered_fixture is True: | ||
with tm.assert_produces_warning(FutureWarning, | ||
check_stacklevel=False): | ||
result = dtype.update_dtype(new_dtype) | ||
else: | ||
result = dtype.update_dtype(new_dtype) | ||
|
||
tm.assert_index_equal(result.categories, expected_categories) | ||
assert result.ordered is expected_ordered | ||
|
||
|
@@ -837,6 +847,14 @@ def test_update_dtype_errors(self, bad_dtype): | |
with pytest.raises(ValueError, match=msg): | ||
dtype.update_dtype(bad_dtype) | ||
|
||
@pytest.mark.parametrize('ordered', [sentinel, None, True, False]) | ||
def test_ordered_none_default_deprecated(self, ordered): | ||
# GH 26403: CDT.ordered only warns if ordered is not explicitly passed | ||
dtype = CategoricalDtype(list('abc'), ordered=ordered) | ||
warning = FutureWarning if ordered is sentinel else None | ||
with tm.assert_produces_warning(warning): | ||
dtype.ordered | ||
|
||
|
||
@pytest.mark.parametrize('dtype', [ | ||
CategoricalDtype, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we really need to user the internal one here? (and all others); I really don't want to expose that even to our code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See the overview that Jeremy gave above (#26403 (comment)) and this comment for more details on why
_ordered
was needed: #26403 (comment)That explains clearly the context on why this was added, and I am fine with it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok that's fine then, @jschendel can you create an issue for this so we don't forget at removal time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sure, back at work today so will create the issue later on tonight