Skip to content

Commit

Permalink
Categorical: fix describe with np.nan
Browse files Browse the repository at this point in the history
  • Loading branch information
jankatins committed Jul 16, 2014
1 parent 704c505 commit 1334684
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 3 deletions.
19 changes: 16 additions & 3 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -931,12 +931,25 @@ def describe(self):
'values' : self._codes }
).groupby('codes').count()

counts.index = self.levels.take(counts.index)
counts = counts.reindex(self.levels)
freqs = counts / float(counts.sum())

from pandas.tools.merge import concat
result = concat([counts,freqs],axis=1)
result.index.name = 'levels'
result.columns = ['counts','freqs']

# Up to now the result is indexed by codes -> replace the codes with the levels.
# Use object dtype so the levels array can also hold NaN.
levels = np.asarray(self.levels, dtype=object)
# use arange so that unused levels are included as well
index = np.arange(0, len(levels))
# handle nan
if -1 in result.index:
# take() with index -1 returns the last element, so append np.nan as the last level: code -1 then maps to NaN
levels = np.append(levels, np.nan)
# also sort the -1 to the last position in the index
index = np.append(index, -1)
result = result.reindex(index)
result.index = levels.take(result.index)
result.index.name = 'levels'

return result
33 changes: 33 additions & 0 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,16 @@ def test_describe(self):
).set_index('levels')
tm.assert_frame_equal(desc, expected)

# check unused levels
cat = self.factor.copy()
cat.levels = ["a","b","c","d"]
desc = cat.describe()
expected = DataFrame.from_dict(dict(counts=[3, 2, 3, np.nan],
freqs=[3/8., 2/8., 3/8., np.nan],
levels=['a', 'b', 'c', 'd'])
).set_index('levels')
tm.assert_frame_equal(desc, expected)

# check an integer one
desc = Categorical([1,2,3,1,2,3,3,2,1,1,1]).describe()
expected = DataFrame.from_dict(dict(counts=[5, 3, 3],
Expand All @@ -226,6 +236,29 @@ def test_describe(self):
).set_index('levels')
tm.assert_frame_equal(desc, expected)

# https://github.com/pydata/pandas/issues/3678
# describe should work with NaN
cat = pd.Categorical([np.nan,1, 2, 2])
desc = cat.describe()
expected = DataFrame.from_dict(dict(counts=[1, 2, 1],
freqs=[1/4., 2/4., 1/4.],
levels=[1,2,np.nan]
)
).set_index('levels')
tm.assert_frame_equal(desc, expected)

# NaN as a level and NaN as a missing value ("not available") should show up as two separate NaN rows in describe()
cat = pd.Categorical([np.nan,1, 2, 2])
cat.levels = [1,2,np.nan]
desc = cat.describe()
expected = DataFrame.from_dict(dict(counts=[1, 2, np.nan, 1],
freqs=[1/4., 2/4., np.nan, 1/4.],
levels=[1,2,np.nan,np.nan]
)
).set_index('levels')
tm.assert_frame_equal(desc, expected)


def test_print(self):
expected = [" a", " b", " b", " a", " a", " c", " c", " c",
"Levels (3, object): [a < b < c]"]
Expand Down

0 comments on commit 1334684

Please sign in to comment.