Skip to content

Commit

Permalink
Merge pull request #8075 from jreback/cat2
Browse files Browse the repository at this point in the history
ENH: add support dtype='category' in Series constructor
  • Loading branch information
jreback committed Aug 21, 2014
2 parents 8dc3c19 + 9159d98 commit aa5e55e
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 16 deletions.
4 changes: 2 additions & 2 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ API changes
3 9
4 NaN
dtype: float64

New behavior (note final value is ``7 = sum([3, 4, NaN])``):

.. ipython:: python
Expand Down Expand Up @@ -346,7 +346,7 @@ Categoricals in Series/DataFrame

:class:`~pandas.Categorical` can now be included in `Series` and `DataFrame` objects and has gained new
methods for manipulating categorical data. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`).
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, :issue:`8075`, :issue:`8076`).

For full docs, see the :ref:`Categorical introduction <categorical>` and the
:ref:`API documentation <api.categorical>`.
Expand Down
12 changes: 7 additions & 5 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,12 +743,14 @@ def fillna(self, fill_value=None, method=None, limit=None, **kwargs):
name=self.name, fastpath=True)

def take_nd(self, indexer, allow_fill=True, fill_value=None):
""" Take the values by the indexer, fill with the fill_value. """
if allow_fill and fill_value is None:
fill_value = np.nan
""" Take the codes by the indexer, fill with the fill_value. """

# filling must always be None/nan here
# but is passed thru internally
assert isnull(fill_value)

values = com.take_1d(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value)
result = Categorical(values=values, levels=self.levels, ordered=self.ordered,
codes = com.take_1d(self._codes, indexer, allow_fill=True, fill_value=-1)
result = Categorical(codes, levels=self.levels, ordered=self.ordered,
name=self.name, fastpath=True)
return result

Expand Down
18 changes: 15 additions & 3 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2326,19 +2326,31 @@ def is_number(obj):
return isinstance(obj, (numbers.Number, np.number))


def _coerce_to_dtype(dtype):
    """
    Normalise a dtype specification (string / np.dtype) into a dtype object.

    A specification recognised by ``is_categorical_dtype`` yields a fresh
    ``CategoricalDtype``; anything else is handed to ``np.dtype``.
    """
    # categorical specs are not understood by numpy, so handle them first
    if not is_categorical_dtype(dtype):
        return np.dtype(dtype)
    return CategoricalDtype()

def _get_dtype(arr_or_dtype):
if isinstance(arr_or_dtype, np.dtype):
return arr_or_dtype
if isinstance(arr_or_dtype, type):
elif isinstance(arr_or_dtype, type):
return np.dtype(arr_or_dtype)
elif isinstance(arr_or_dtype, CategoricalDtype):
return CategoricalDtype()
return arr_or_dtype.dtype


def _get_dtype_type(arr_or_dtype):
if isinstance(arr_or_dtype, np.dtype):
return arr_or_dtype.type
if isinstance(arr_or_dtype, type):
elif isinstance(arr_or_dtype, type):
return np.dtype(arr_or_dtype).type
elif isinstance(arr_or_dtype, CategoricalDtype):
return CategoricalDtypeType
return arr_or_dtype.dtype.type


Expand Down Expand Up @@ -2488,7 +2500,7 @@ def _astype_nansafe(arr, dtype, copy=True):
""" return a view if copy is False, but
need to be very careful as the result shape could change! """
if not isinstance(dtype, np.dtype):
dtype = np.dtype(dtype)
dtype = _coerce_to_dtype(dtype)

if is_datetime64_dtype(arr):
if dtype == object:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def _validate_dtype(self, dtype):
""" validate the passed dtype """

if dtype is not None:
dtype = np.dtype(dtype)
dtype = com._coerce_to_dtype(dtype)

# a compound dtype
if dtype.kind == 'V':
Expand Down
10 changes: 6 additions & 4 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
is_list_like, _values_from_object,
_possibly_cast_to_datetime, _possibly_castable,
_possibly_convert_platform, _try_sort,
ABCSparseArray, _maybe_match_name,
ABCSparseArray, _maybe_match_name, _coerce_to_dtype,
_ensure_object, SettingWithCopyError)
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
_ensure_index)
Expand Down Expand Up @@ -2434,7 +2434,7 @@ def _sanitize_array(data, index, dtype=None, copy=False,
""" sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified """

if dtype is not None:
dtype = np.dtype(dtype)
dtype = _coerce_to_dtype(dtype)

if isinstance(data, ma.MaskedArray):
mask = ma.getmaskarray(data)
Expand All @@ -2455,9 +2455,11 @@ def _try_cast(arr, take_fast_path):
arr = _possibly_cast_to_datetime(arr, dtype)
subarr = pa.array(arr, dtype=dtype, copy=copy)
except (ValueError, TypeError):
if dtype is not None and raise_cast_failure:
if com.is_categorical_dtype(dtype):
subarr = Categorical(arr)
elif dtype is not None and raise_cast_failure:
raise
else: # pragma: no cover
else:
subarr = pa.array(arr, dtype=object, copy=copy)
return subarr

Expand Down
47 changes: 46 additions & 1 deletion pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,13 +840,58 @@ def test_creation_astype(self):
df["cats"] = df["cats"].astype("category")
tm.assert_frame_equal(exp_df, df)


df = pd.DataFrame({"cats":['a', 'b', 'b', 'a', 'a', 'd'], "vals":[1,2,3,4,5,6]})
cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
exp_df = pd.DataFrame({"cats":cats, "vals":[1,2,3,4,5,6]})
df["cats"] = df["cats"].astype("category")
tm.assert_frame_equal(exp_df, df)

def test_construction_series(self):
    """ Series(..., dtype='category') must match Series(...).astype('category'). """

    # the same check is applied to integer and string input
    for values in ([1, 2, 3, 1], ["a", "b", "c", "a"]):
        expected = Series(values).astype('category')
        result = Series(values, dtype='category')
        tm.assert_series_equal(result, expected)

    # insert into frame with different index
    # GH 8076
    dates = pd.date_range('20000101', periods=3)
    all_missing = Series(Categorical(values=[np.nan, np.nan, np.nan],
                                     levels=['a', 'b', 'c']))
    all_missing.index = dates
    expected_frame = DataFrame({'x': all_missing})

    # the categorical Series carries a default index, so nothing aligns with
    # the date index and every row is expected to be missing
    frame = DataFrame({'x': Series(['a', 'b', 'c'], dtype='category')},
                      index=dates)
    tm.assert_frame_equal(frame, expected_frame)

def test_reindex(self):
    """ Reindexing a categorical Series keeps its levels; unmatched labels become NaN. """

    s = Series(['a', 'b', 'c'], dtype='category')

    def check(new_index, values):
        # build the expected categorical Series on ``new_index`` and compare
        # it with what reindexing ``s`` produces
        expected = Series(Categorical(values=values, levels=['a', 'b', 'c']))
        expected.index = new_index
        result = s.reindex(new_index)
        tm.assert_series_equal(result, expected)

    # reindexing to an invalid Categorical
    check(pd.date_range('20000101', periods=3), [np.nan, np.nan, np.nan])

    # partial reindexing
    check([1, 2], ['b', 'c'])
    check([2, 3], ['c', np.nan])



def test_sideeffects_free(self):

# Passing a categorical to a Series and then changing values in either the series or the
Expand Down

0 comments on commit aa5e55e

Please sign in to comment.