Skip to content

Commit

Permalink
BUG: union_categoricals can't handle NaN
Browse files Browse the repository at this point in the history
  • Loading branch information
sinhrks committed Jul 22, 2016
1 parent 9f94e6a commit 4312a32
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 4 deletions.
53 changes: 53 additions & 0 deletions pandas/tools/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,59 @@ def test_union_categorical(self):
with tm.assertRaises(ValueError):
union_categoricals([])

def test_union_categoricals_nan(self):
    # GH 13759
    # Each case unions categoricals holding missing values (NaN / NaT)
    # and compares the result with a Categorical built directly from
    # the concatenated values.

    # numeric values, NaN present in both inputs
    first = pd.Categorical([1, 2, np.nan])
    second = pd.Categorical([3, 2, np.nan])
    result = union_categoricals([first, second])
    expected = Categorical([1, 2, np.nan, 3, 2, np.nan])
    tm.assert_categorical_equal(result, expected)

    # string values, NaN present only in the second input
    first = pd.Categorical(['A', 'B'])
    second = pd.Categorical(['B', 'B', np.nan])
    result = union_categoricals([first, second])
    expected = Categorical(['A', 'B', 'B', 'B', np.nan])
    tm.assert_categorical_equal(result, expected)

    # datetime values with NaT; the expected categories are given
    # explicitly in the order 2011-01-01, 2011-03-01, 2011-02-01
    val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'),
            pd.NaT]
    val2 = [pd.NaT, pd.Timestamp('2011-01-01'),
            pd.Timestamp('2011-02-01')]
    expected_categories = [pd.Timestamp('2011-01-01'),
                           pd.Timestamp('2011-03-01'),
                           pd.Timestamp('2011-02-01')]

    result = union_categoricals([pd.Categorical(val1),
                                 pd.Categorical(val2)])
    expected = Categorical(val1 + val2, categories=expected_categories)
    tm.assert_categorical_equal(result, expected)

    # all NaN
    all_nan = pd.Categorical([np.nan, np.nan])
    result = union_categoricals([all_nan, pd.Categorical(['X'])])
    expected = Categorical([np.nan, np.nan, 'X'])
    tm.assert_categorical_equal(result, expected)

    # both inputs consist solely of NaN
    first = pd.Categorical([np.nan, np.nan])
    second = pd.Categorical([np.nan, np.nan])
    result = union_categoricals([first, second])
    expected = Categorical([np.nan, np.nan, np.nan, np.nan])
    tm.assert_categorical_equal(result, expected)

def test_union_categoricals_empty(self):
    # GH 13759
    # Each case unions categoricals where at least one input is empty.

    # empty + empty
    first = pd.Categorical([])
    second = pd.Categorical([])
    result = union_categoricals([first, second])
    expected = Categorical([])
    tm.assert_categorical_equal(result, expected)

    # empty + single float value
    first = pd.Categorical([])
    second = pd.Categorical([1.0])
    result = union_categoricals([first, second])
    expected = Categorical([1.0])
    tm.assert_categorical_equal(result, expected)

    # to make dtype equal, the NaN-only input is built from an
    # explicit float64 array and doubles as the expected result
    nanc = pd.Categorical(np.array([np.nan], dtype=np.float64))
    result = union_categoricals([nanc, pd.Categorical([])])
    tm.assert_categorical_equal(result, nanc)

def test_concat_bug_1719(self):
ts1 = tm.makeTimeSeries()
ts2 = tm.makeTimeSeries()[::2]
Expand Down
14 changes: 10 additions & 4 deletions pandas/types/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas.tslib as tslib
from pandas import compat
from pandas.compat import map
from pandas.core.algorithms import take_1d
from .common import (is_categorical_dtype,
is_sparse,
is_datetimetz,
Expand Down Expand Up @@ -254,10 +255,15 @@ def union_categoricals(to_union):

new_codes = []
for c in to_union:
indexer = categories.get_indexer(c.categories)
new_codes.append(indexer.take(c.codes))
codes = np.concatenate(new_codes)
return Categorical(codes, categories=categories, ordered=False,
if len(c.categories) > 0:
indexer = categories.get_indexer(c.categories)
new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
else:
# must be all NaN
new_codes.append(c.codes)

new_codes = np.concatenate(new_codes)
return Categorical(new_codes, categories=categories, ordered=False,
fastpath=True)


Expand Down

0 comments on commit 4312a32

Please sign in to comment.