Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add support dtype='category' in Series constructor #8075

Merged
merged 2 commits into from
Aug 21, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ API changes
3 9
4 NaN
dtype: float64

New behavior (note final value is ``7 = sum([3, 4, NaN])``):

.. ipython:: python
Expand Down Expand Up @@ -346,7 +346,7 @@ Categoricals in Series/DataFrame

:class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and has gained new
methods for manipulating its data. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`).
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, :issue:`8075`, :issue:`8076`).

For full docs, see the :ref:`Categorical introduction <categorical>` and the
:ref:`API documentation <api.categorical>`.
Expand Down
12 changes: 7 additions & 5 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,12 +743,14 @@ def fillna(self, fill_value=None, method=None, limit=None, **kwargs):
name=self.name, fastpath=True)

def take_nd(self, indexer, allow_fill=True, fill_value=None):
""" Take the values by the indexer, fill with the fill_value. """
if allow_fill and fill_value is None:
fill_value = np.nan
""" Take the codes by the indexer, fill with the fill_value. """

# the fill_value must always be None/NaN here,
# but it is passed through internally
assert isnull(fill_value)

values = com.take_1d(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value)
result = Categorical(values=values, levels=self.levels, ordered=self.ordered,
codes = com.take_1d(self._codes, indexer, allow_fill=True, fill_value=-1)
result = Categorical(codes, levels=self.levels, ordered=self.ordered,
name=self.name, fastpath=True)
return result

Expand Down
18 changes: 15 additions & 3 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2326,19 +2326,31 @@ def is_number(obj):
return isinstance(obj, (numbers.Number, np.number))


def _coerce_to_dtype(dtype):
    """Coerce a string / np.dtype to a dtype.

    Whatever ``is_categorical_dtype`` accepts is turned into a fresh
    ``CategoricalDtype`` instance; every other input is handed to
    ``np.dtype`` unchanged.
    """
    if not is_categorical_dtype(dtype):
        return np.dtype(dtype)
    return CategoricalDtype()

def _get_dtype(arr_or_dtype):
if isinstance(arr_or_dtype, np.dtype):
return arr_or_dtype
if isinstance(arr_or_dtype, type):
elif isinstance(arr_or_dtype, type):
return np.dtype(arr_or_dtype)
elif isinstance(arr_or_dtype, CategoricalDtype):
return CategoricalDtype()
return arr_or_dtype.dtype


def _get_dtype_type(arr_or_dtype):
if isinstance(arr_or_dtype, np.dtype):
return arr_or_dtype.type
if isinstance(arr_or_dtype, type):
elif isinstance(arr_or_dtype, type):
return np.dtype(arr_or_dtype).type
elif isinstance(arr_or_dtype, CategoricalDtype):
return CategoricalDtypeType
return arr_or_dtype.dtype.type


Expand Down Expand Up @@ -2488,7 +2500,7 @@ def _astype_nansafe(arr, dtype, copy=True):
""" return a view if copy is False, but
need to be very careful as the result shape could change! """
if not isinstance(dtype, np.dtype):
dtype = np.dtype(dtype)
dtype = _coerce_to_dtype(dtype)

if is_datetime64_dtype(arr):
if dtype == object:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def _validate_dtype(self, dtype):
""" validate the passed dtype """

if dtype is not None:
dtype = np.dtype(dtype)
dtype = com._coerce_to_dtype(dtype)

# a compound dtype
if dtype.kind == 'V':
Expand Down
10 changes: 6 additions & 4 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
is_list_like, _values_from_object,
_possibly_cast_to_datetime, _possibly_castable,
_possibly_convert_platform, _try_sort,
ABCSparseArray, _maybe_match_name,
ABCSparseArray, _maybe_match_name, _coerce_to_dtype,
_ensure_object, SettingWithCopyError)
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
_ensure_index)
Expand Down Expand Up @@ -2434,7 +2434,7 @@ def _sanitize_array(data, index, dtype=None, copy=False,
""" sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified """

if dtype is not None:
dtype = np.dtype(dtype)
dtype = _coerce_to_dtype(dtype)

if isinstance(data, ma.MaskedArray):
mask = ma.getmaskarray(data)
Expand All @@ -2455,9 +2455,11 @@ def _try_cast(arr, take_fast_path):
arr = _possibly_cast_to_datetime(arr, dtype)
subarr = pa.array(arr, dtype=dtype, copy=copy)
except (ValueError, TypeError):
if dtype is not None and raise_cast_failure:
if com.is_categorical_dtype(dtype):
subarr = Categorical(arr)
elif dtype is not None and raise_cast_failure:
raise
else: # pragma: no cover
else:
subarr = pa.array(arr, dtype=object, copy=copy)
return subarr

Expand Down
47 changes: 46 additions & 1 deletion pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,13 +840,58 @@ def test_creation_astype(self):
df["cats"] = df["cats"].astype("category")
tm.assert_frame_equal(exp_df, df)


df = pd.DataFrame({"cats":['a', 'b', 'b', 'a', 'a', 'd'], "vals":[1,2,3,4,5,6]})
cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
exp_df = pd.DataFrame({"cats":cats, "vals":[1,2,3,4,5,6]})
df["cats"] = df["cats"].astype("category")
tm.assert_frame_equal(exp_df, df)

def test_construction_series(self):
    """Series(..., dtype='category') must match a later astype('category')."""

    # building with dtype='category' is equivalent to converting afterwards,
    # for both numeric and string input
    for values in ([1, 2, 3, 1], ["a", "b", "c", "a"]):
        exp = Series(values).astype('category')
        res = Series(values, dtype='category')
        tm.assert_series_equal(res, exp)

    # insert into frame with different index
    # GH 8076
    index = pd.date_range('20000101', periods=3)
    all_missing = Categorical(values=[np.nan, np.nan, np.nan],
                              levels=['a', 'b', 'c'])
    expected_column = Series(all_missing)
    expected_column.index = index
    expected = DataFrame({'x': expected_column})

    categorical_column = Series(['a', 'b', 'c'], dtype='category')
    df = DataFrame({'x': categorical_column}, index=index)
    tm.assert_frame_equal(df, expected)

def test_reindex(self):
    """Reindexing a categorical Series keeps its levels and fills with NaN."""

    levels = ['a', 'b', 'c']
    s = Series(['a', 'b', 'c'], dtype='category')

    # reindexing to an invalid Categorical
    index = pd.date_range('20000101', periods=3)
    result = s.reindex(index)
    expected = Series(Categorical(values=[np.nan, np.nan, np.nan],
                                  levels=levels))
    expected.index = index
    tm.assert_series_equal(result, expected)

    # partial reindexing: (new labels, values expected at those labels)
    partial_cases = (
        ([1, 2], ['b', 'c']),
        ([2, 3], ['c', np.nan]),
    )
    for labels, values in partial_cases:
        expected = Series(Categorical(values=values, levels=levels))
        expected.index = labels
        result = s.reindex(labels)
        tm.assert_series_equal(result, expected)



def test_sideeffects_free(self):

# Passing a categorical to a Series and then changing values in either the series or the
Expand Down