Skip to content

Commit

Permalink
BUG: Fix Series.astype and Categorical.astype to update existing Cate…
Browse files Browse the repository at this point in the history
…gorical data (#18710)
  • Loading branch information
jschendel authored and jreback committed Dec 13, 2017
1 parent 040470a commit 73e73db
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 38 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ Conversion
- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`)
- Fixed a bug where ``FY5253`` date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`)
- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`)
- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`)


Indexing
Expand Down
9 changes: 6 additions & 3 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,9 +436,12 @@ def astype(self, dtype, copy=True):
"""
if is_categorical_dtype(dtype):
if copy is True:
return self.copy()
return self
# GH 10696/18593
dtype = self.dtype._update_dtype(dtype)
self = self.copy() if copy else self
if dtype == self.dtype:
return self
return self._set_dtype(dtype)
return np.array(self, dtype=dtype, copy=copy)

@cache_readonly
Expand Down
35 changes: 10 additions & 25 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
import pandas.core.dtypes.concat as _concat

from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex
from pandas.core.common import is_null_slice
from pandas.core.common import is_null_slice, _any_not_none
import pandas.core.algorithms as algos

from pandas.core.index import Index, MultiIndex, _ensure_index
Expand Down Expand Up @@ -573,7 +573,6 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
raise TypeError(msg)

# may need to convert to categorical
# this is only called for non-categoricals
if self.is_categorical_astype(dtype):

# deprecated 17636
Expand All @@ -589,13 +588,16 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
"CategoricalDtype instead",
FutureWarning, stacklevel=7)

kwargs = kwargs.copy()
categories = getattr(dtype, 'categories', None)
ordered = getattr(dtype, 'ordered', False)
categories = kwargs.get('categories', None)
ordered = kwargs.get('ordered', None)
if _any_not_none(categories, ordered):
dtype = CategoricalDtype(categories, ordered)

kwargs.setdefault('categories', categories)
kwargs.setdefault('ordered', ordered)
return self.make_block(Categorical(self.values, **kwargs))
if is_categorical_dtype(self.values):
# GH 10696/18593: update an existing categorical efficiently
return self.make_block(self.values.astype(dtype, copy=copy))

return self.make_block(Categorical(self.values, dtype=dtype))

# astype processing
dtype = np.dtype(dtype)
Expand Down Expand Up @@ -2427,23 +2429,6 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):

return self.make_block_same_class(new_values, new_mgr_locs)

def _astype(self, dtype, copy=False, errors='raise', values=None,
klass=None, mgr=None):
"""
Coerce to the new type (if copy=True, return a new copy)
raise on an except if raise == True
"""

if self.is_categorical_astype(dtype):
values = self.values
else:
values = np.asarray(self.values).astype(dtype, copy=False)

if copy:
values = values.copy()

return self.make_block(values)

def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """

Expand Down
54 changes: 49 additions & 5 deletions pandas/tests/categorical/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,54 @@ def test_codes_dtypes(self):
result = result.remove_categories(['foo%05d' % i for i in range(300)])
assert result.codes.dtype == 'int8'

def test_astype_categorical(self):
@pytest.mark.parametrize('ordered', [True, False])
def test_astype(self, ordered):
# string
cat = Categorical(list('abbaaccc'), ordered=ordered)
result = cat.astype(object)
expected = np.array(cat)
tm.assert_numpy_array_equal(result, expected)

msg = 'could not convert string to float'
with tm.assert_raises_regex(ValueError, msg):
cat.astype(float)

# numeric
cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered)
result = cat.astype(object)
expected = np.array(cat, dtype=object)
tm.assert_numpy_array_equal(result, expected)

result = cat.astype(int)
expected = np.array(cat, dtype=np.int)
tm.assert_numpy_array_equal(result, expected)

result = cat.astype(float)
expected = np.array(cat, dtype=np.float)
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize('dtype_ordered', [True, False])
@pytest.mark.parametrize('cat_ordered', [True, False])
def test_astype_category(self, dtype_ordered, cat_ordered):
# GH 10696/18593
data = list('abcaacbab')
cat = Categorical(data, categories=list('bac'), ordered=cat_ordered)

# standard categories
dtype = CategoricalDtype(ordered=dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(
data, categories=cat.categories, ordered=dtype_ordered)
tm.assert_categorical_equal(result, expected)

cat = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
tm.assert_categorical_equal(cat, cat.astype('category'))
tm.assert_almost_equal(np.array(cat), cat.astype('object'))
# non-standard categories
dtype = CategoricalDtype(list('adc'), dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, dtype=dtype)
tm.assert_categorical_equal(result, expected)

pytest.raises(ValueError, lambda: cat.astype(float))
if dtype_ordered is False:
# dtype='category' can't specify ordered, so only test once
result = cat.astype('category')
expected = cat
tm.assert_categorical_equal(result, expected)
9 changes: 4 additions & 5 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,19 +411,18 @@ def test_astype(self):
result = IntervalIndex.from_intervals(result.values)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize('copy', [True, False])
@pytest.mark.parametrize('name', [None, 'foo'])
@pytest.mark.parametrize('dtype_ordered', [True, False])
@pytest.mark.parametrize('index_ordered', [True, False])
def test_astype_category(self, copy, name, dtype_ordered, index_ordered):
def test_astype_category(self, name, dtype_ordered, index_ordered):
# GH 18630
index = self.create_index(ordered=index_ordered)
if name:
index = index.rename(name)

# standard categories
dtype = CategoricalDtype(ordered=dtype_ordered)
result = index.astype(dtype, copy=copy)
result = index.astype(dtype)
expected = CategoricalIndex(index.tolist(),
name=name,
categories=index.categories,
Expand All @@ -432,13 +431,13 @@ def test_astype_category(self, copy, name, dtype_ordered, index_ordered):

# non-standard categories
dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered)
result = index.astype(dtype, copy=copy)
result = index.astype(dtype)
expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype)
tm.assert_index_equal(result, expected)

if dtype_ordered is False:
# dtype='category' can't specify ordered, so only test once
result = index.astype('category', copy=copy)
result = index.astype('category')
expected = index
tm.assert_index_equal(result, expected)

Expand Down
38 changes: 38 additions & 0 deletions pandas/tests/series/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,44 @@ def cmp(a, b):
lambda x: x.astype('object').astype(Categorical)]:
pytest.raises(TypeError, lambda: invalid(s))

@pytest.mark.parametrize('name', [None, 'foo'])
@pytest.mark.parametrize('dtype_ordered', [True, False])
@pytest.mark.parametrize('series_ordered', [True, False])
def test_astype_categorical_to_categorical(self, name, dtype_ordered,
series_ordered):
# GH 10696/18593
s_data = list('abcaacbab')
s_dtype = CategoricalDtype(list('bac'), ordered=series_ordered)
s = Series(s_data, dtype=s_dtype, name=name)

# unspecified categories
dtype = CategoricalDtype(ordered=dtype_ordered)
result = s.astype(dtype)
exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered)
expected = Series(s_data, name=name, dtype=exp_dtype)
tm.assert_series_equal(result, expected)

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = s.astype('category', ordered=dtype_ordered)
tm.assert_series_equal(result, expected)

# different categories
dtype = CategoricalDtype(list('adc'), dtype_ordered)
result = s.astype(dtype)
expected = Series(s_data, name=name, dtype=dtype)
tm.assert_series_equal(result, expected)

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = s.astype(
'category', categories=list('adc'), ordered=dtype_ordered)
tm.assert_series_equal(result, expected)

if dtype_ordered is False:
# not specifying ordered, so only test once
expected = s
result = s.astype('category')
tm.assert_series_equal(result, expected)

def test_astype_categoricaldtype(self):
s = Series(['a', 'b', 'a'])
result = s.astype(CategoricalDtype(['a', 'b'], ordered=True))
Expand Down

0 comments on commit 73e73db

Please sign in to comment.