Skip to content

Commit

Permalink
Merge pull request #7000 from jreback/groupby_counts_agg
Browse files Browse the repository at this point in the history
ENH/BUG: add count to grouper / ensure that grouper keys are not included in the returned result
  • Loading branch information
jreback committed Apr 29, 2014
2 parents 97c4a2e + 134dd1f commit d2ead2c
Show file tree
Hide file tree
Showing 8 changed files with 189 additions and 63 deletions.
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,8 @@ API Changes
validation warnings in :func:`read_csv`/:func:`read_table` (:issue:`6607`)
- Raise a ``TypeError`` when ``DataFrame`` is passed an iterator as the
``data`` argument (:issue:`5357`)
- groupby will now not return the grouped column for non-cython functions (:issue:`5610`),
  as it is already the index

Deprecations
~~~~~~~~~~~~
Expand Down
19 changes: 18 additions & 1 deletion doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,29 @@ API changes

.. ipython:: python

DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
g = df.groupby('A')
g.nth(0) # can also use negative ints

g.nth(0, dropna='any') # similar to old behaviour

groupby will now not return the grouped column for non-cython functions (:issue:`5610`),
as it is already the index

.. ipython:: python

df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B'])
g = df.groupby('A')
g.count()
g.describe()

passing ``as_index=False`` will leave the grouped column in-place (this is not a change in 0.14.0)

.. ipython:: python

df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B'])
g = df.groupby('A',as_index=False)
g.count()
g.describe()

- Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping
by a Time and a string field simultaneously. See :ref:`the docs <groupby.specify>`. (:issue:`3794`)

Expand Down
10 changes: 6 additions & 4 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,11 +226,13 @@ def describe(self):
"""
# Hack?
from pandas.core.frame import DataFrame
grouped = DataFrame(self.labels).groupby(0)
counts = grouped.count().values.squeeze()
counts = DataFrame({
'labels' : self.labels,
'values' : self.labels }
).groupby('labels').count().squeeze().values
freqs = counts / float(counts.sum())
return DataFrame.from_dict({
return DataFrame({
'counts': counts,
'freqs': freqs,
'levels': self.levels
}).set_index('levels')
}).set_index('levels')
22 changes: 13 additions & 9 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,11 +611,19 @@ def __neg__(self):
arr = operator.inv(values)
else:
arr = operator.neg(values)
return self._wrap_array(arr, self.axes, copy=False)
return self.__array_wrap__(arr)

def __invert__(self):
arr = operator.inv(_values_from_object(self))
return self._wrap_array(arr, self.axes, copy=False)
try:
arr = operator.inv(_values_from_object(self))
return self.__array_wrap__(arr)
except:

# inv fails with 0 len
if not np.prod(self.shape):
return self

raise

def equals(self, other):
"""
Expand Down Expand Up @@ -707,15 +715,11 @@ def __abs__(self):
#----------------------------------------------------------------------
# Array Interface

def _wrap_array(self, arr, axes, copy=False):
    """Re-wrap a raw ndarray as this object's type, reusing *axes*.

    Builds the constructor's axis keyword arguments from the given axes
    and propagates metadata from ``self`` via ``__finalize__``.
    """
    axes_kwargs = self._construct_axes_dict_from(self, axes, copy=copy)
    wrapped = self._constructor(arr, **axes_kwargs)
    return wrapped.__finalize__(self)

def __array__(self, dtype=None):
    """numpy array-interface hook: return the underlying values.

    ``dtype`` is accepted for numpy-protocol compatibility but is not
    used here -- the values are returned as-is.
    """
    return _values_from_object(self)

def __array_wrap__(self, result):
d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
def __array_wrap__(self, result, copy=False):
d = self._construct_axes_dict(self._AXIS_ORDERS, copy=copy)
return self._constructor(result, **d).__finalize__(self)

# ideally we would define this to avoid the getattr checks, but
Expand Down
107 changes: 78 additions & 29 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,23 @@ def _selection_list(self):
return [self._selection]
return self._selection

@cache_readonly
def _selected_obj(self):
    """The object being operated on: the full ``obj`` when no column
    selection has been made (or when grouping a Series), otherwise the
    selected subset ``obj[self._selection]``.
    """
    if self._selection is None or isinstance(self.obj, Series):
        return self.obj
    else:
        return self.obj[self._selection]

def _set_selection_from_grouper(self):
    """Create a column selection when grouping by non-level, in-axis keys.

    If no explicit selection exists and ``as_index`` is set, exclude any
    grouper key columns (named, non-level groupings present on the info
    axis) from the selection so they are not part of the result values.
    """
    if self._selection is not None or not self.as_index:
        return
    groupings = getattr(self.grouper, 'groupings', None)
    if groupings is None:
        return
    ax = self.obj._info_axis
    key_names = [g.name for g in groupings
                 if g.level is None and g.name is not None and g.name in ax]
    if len(key_names):
        self._selection = (ax - Index(key_names)).tolist()

def _local_dir(self):
    """Attribute names for tab-completion: the wrapped object's own
    attributes plus the groupby method whitelist, deduplicated and sorted.
    """
    names = set(self.obj._local_dir())
    names.update(self._apply_whitelist)
    return sorted(names)

Expand All @@ -453,7 +470,6 @@ def __getattr__(self, attr):
return object.__getattribute__(self, attr)
if attr in self.obj:
return self[attr]

if hasattr(self.obj, attr):
return self._make_wrapper(attr)

Expand All @@ -472,6 +488,10 @@ def _make_wrapper(self, name):
type(self).__name__))
raise AttributeError(msg)

# need to setup the selection
# as are not passed directly but in the grouper
self._set_selection_from_grouper()

f = getattr(self._selected_obj, name)
if not isinstance(f, types.MethodType):
return self.apply(lambda self: getattr(self, name))
Expand Down Expand Up @@ -503,7 +523,19 @@ def curried(x):
try:
return self.apply(curried_with_axis)
except Exception:
return self.apply(curried)
try:
return self.apply(curried)
except Exception:

# related to : GH3688
# try item-by-item
# this can be called recursively, so need to raise ValueError if
# we don't have this method to indicated to aggregate to
# mark this column as an error
try:
return self._aggregate_item_by_item(name, *args, **kwargs)
except (AttributeError):
raise ValueError

return wrapper

Expand Down Expand Up @@ -624,6 +656,7 @@ def mean(self):
except GroupByError:
raise
except Exception: # pragma: no cover
self._set_selection_from_grouper()
f = lambda x: x.mean(axis=self.axis)
return self._python_agg_general(f)

Expand All @@ -639,6 +672,7 @@ def median(self):
raise
except Exception: # pragma: no cover

self._set_selection_from_grouper()
def f(x):
if isinstance(x, np.ndarray):
x = Series(x)
Expand All @@ -655,6 +689,7 @@ def std(self, ddof=1):
if ddof == 1:
return self._cython_agg_general('std')
else:
self._set_selection_from_grouper()
f = lambda x: x.std(ddof=ddof)
return self._python_agg_general(f)

Expand All @@ -667,15 +702,26 @@ def var(self, ddof=1):
if ddof == 1:
return self._cython_agg_general('var')
else:
self._set_selection_from_grouper()
f = lambda x: x.var(ddof=ddof)
return self._python_agg_general(f)

def size(self):
    """
    Compute group sizes

    Delegates to the underlying grouper, which counts the number of
    rows in each group.
    """
    return self.grouper.size()

def count(self, axis=0):
    """
    Number of non-null items in each group.

    Parameters
    ----------
    axis : axis number, default 0
        the grouping axis

    Notes
    -----
    Sets the selection from the grouper first so grouper key columns are
    excluded from the counted values; the per-group sum of ``notnull``
    is cast to int64.
    """
    self._set_selection_from_grouper()
    return self._python_agg_general(lambda x: notnull(x).sum(axis=axis)).astype('int64')

sum = _groupby_function('sum', 'add', np.sum)
prod = _groupby_function('prod', 'prod', np.prod)
min = _groupby_function('min', 'min', np.min, numeric_only=False)
Expand All @@ -685,14 +731,14 @@ def size(self):
last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
_convert=True)


def ohlc(self):
"""
Compute sum of values, excluding missing values
For multiple groupings, the result index will be a MultiIndex
"""
return self._cython_agg_general('ohlc')
return self._apply_to_column_groupbys(
lambda x: x._cython_agg_general('ohlc'))

def nth(self, n, dropna=None):
"""
Expand Down Expand Up @@ -888,13 +934,6 @@ def _cumcount_array(self, arr=None, **kwargs):
cumcounts[v] = arr[len(v)-1::-1]
return cumcounts

@cache_readonly
def _selected_obj(self):
    """The object being operated on: the full ``obj`` when no column
    selection has been made (or when grouping a Series), otherwise the
    selected subset ``obj[self._selection]``.
    """
    if self._selection is None or isinstance(self.obj, Series):
        return self.obj
    else:
        return self.obj[self._selection]

def _index_with_as_index(self, b):
"""
Take boolean mask of index to be returned from apply, if as_index=True
Expand Down Expand Up @@ -990,12 +1029,23 @@ def _concat_objects(self, keys, values, not_indexed_same=False):
result = result.reindex(ax)
else:
result = result.reindex_axis(ax, axis=self.axis)
elif self.group_keys and self.as_index:
group_keys = keys
group_levels = self.grouper.levels
group_names = self.grouper.names
result = concat(values, axis=self.axis, keys=group_keys,
levels=group_levels, names=group_names)

elif self.group_keys:

if self.as_index:

# possible MI return case
group_keys = keys
group_levels = self.grouper.levels
group_names = self.grouper.names
result = concat(values, axis=self.axis, keys=group_keys,
levels=group_levels, names=group_names)
else:

# GH5610, returns a MI, with the first level being a
# range index
keys = list(range(len(values)))
result = concat(values, axis=self.axis, keys=keys)
else:
result = concat(values, axis=self.axis)

Expand Down Expand Up @@ -2187,6 +2237,9 @@ def true_and_notnull(x, *args, **kwargs):
filtered = self._apply_filter(indices, dropna)
return filtered

def _apply_to_column_groupbys(self, func):
    """Pass-through for the base/Series case: invoke ``func`` directly on
    this groupby object (no per-column splitting is performed here).
    """
    result = func(self)
    return result

class NDFrameGroupBy(GroupBy):

Expand Down Expand Up @@ -2486,6 +2539,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
elif hasattr(self.grouper, 'groupings'):
if len(self.grouper.groupings) > 1:
key_index = MultiIndex.from_tuples(keys, names=key_names)

else:
ping = self.grouper.groupings[0]
if len(keys) == ping.ngroups:
Expand All @@ -2498,8 +2552,13 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
# reorder the values
values = [values[i] for i in indexer]
else:

key_index = Index(keys, name=key_names[0])

# don't use the key indexer
if not self.as_index:
key_index = None

# make Nones an empty object
if com._count_not_none(*values) != len(values):
v = next(v for v in values if v is not None)
Expand Down Expand Up @@ -2569,7 +2628,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):

# normally use vstack as its faster than concat
# and if we have mi-columns
if not _np_version_under1p7 or isinstance(v.index,MultiIndex):
if not _np_version_under1p7 or isinstance(v.index,MultiIndex) or key_index is None:
stacked_values = np.vstack([np.asarray(x) for x in values])
result = DataFrame(stacked_values,index=key_index,columns=index)
else:
Expand Down Expand Up @@ -2889,16 +2948,6 @@ def _apply_to_column_groupbys(self, func):
in self._iterate_column_groupbys()),
keys=self._selected_obj.columns, axis=1)

def ohlc(self):
"""
Compute sum of values, excluding missing values
For multiple groupings, the result index will be a MultiIndex
"""
return self._apply_to_column_groupbys(
lambda x: x._cython_agg_general('ohlc'))


from pandas.tools.plotting import boxplot_frame_groupby
DataFrameGroupBy.boxplot = boxplot_frame_groupby

Expand Down
17 changes: 2 additions & 15 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,12 +370,12 @@ def __array__(self, result=None):
""" the array interface, return my values """
return self.values

def __array_wrap__(self, result):
def __array_wrap__(self, result, copy=False):
"""
Gets called prior to a ufunc (and after)
"""
return self._constructor(result, index=self.index,
copy=False).__finalize__(self)
copy=copy).__finalize__(self)

def __contains__(self, key):
return key in self.index
Expand Down Expand Up @@ -959,19 +959,6 @@ def iteritems(self):
if compat.PY3: # pragma: no cover
items = iteritems

# inversion
def __neg__(self):
    # Unary minus.  Boolean arrays have no numeric negation, so ``-s``
    # on a boolean Series is treated as elementwise inversion instead.
    values = self.values
    if values.dtype == np.bool_:
        arr = operator.inv(values)
    else:
        arr = operator.neg(values)
    # Re-wrap with the original index; __finalize__ propagates metadata.
    return self._constructor(arr, self.index).__finalize__(self)

def __invert__(self):
    # Elementwise inversion (``~s``); the result is re-wrapped as the
    # same type with the original index, metadata via __finalize__.
    arr = operator.inv(self.values)
    return self._constructor(arr, self.index).__finalize__(self)

#----------------------------------------------------------------------
# unbox reductions

Expand Down
Loading

0 comments on commit d2ead2c

Please sign in to comment.