REF: remove _get_grouper, make Grouper.__init__ less stateful (#51155)
* REF: inline BaseGrouper/BinGrouper._get_grouper

* REF: set grouping_vector once at the end of __init__

* REF: make passed_categorical a cache_readonly

* mypy fixup
jbrockmendel authored Feb 6, 2023
1 parent ad03e49 commit 9bd15db
Showing 2 changed files with 35 additions and 51 deletions.
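
The first two commit-message bullets describe the core pattern of the change: Grouping.__init__ no longer mutates self.grouping_vector at several points, but builds a local grouping_vector and assigns the attribute exactly once at the end. A minimal, self-contained sketch of that pattern (illustrative class and argument names, not the pandas code):

import pandas as pd

class StatefulSketch:
    # Before: readers must track the attribute through every intermediate state.
    def __init__(self, index: pd.Index, mapper=None):
        self.grouping_vector = index
        if mapper is not None:
            self.grouping_vector = self.grouping_vector.map(mapper)

class LessStatefulSketch:
    # After: all intermediate work happens on a local; one assignment at the end.
    def __init__(self, index: pd.Index, mapper=None):
        grouping_vector = index
        if mapper is not None:
            grouping_vector = grouping_vector.map(mapper)
        self.grouping_vector = grouping_vector

Nothing observable changes for callers; only the construction path becomes easier to follow and to type-check.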
68 changes: 35 additions & 33 deletions pandas/core/groupby/grouper.py
@@ -464,7 +464,6 @@ class Grouping:
 
     _codes: npt.NDArray[np.signedinteger] | None = None
     _group_index: Index | None = None
-    _passed_categorical: bool
     _all_grouper: Categorical | None
     _orig_cats: Index | None
     _index: Index
@@ -483,7 +482,7 @@ def __init__(
     ) -> None:
         self.level = level
         self._orig_grouper = grouper
-        self.grouping_vector = _convert_grouper(index, grouper)
+        grouping_vector = _convert_grouper(index, grouper)
         self._all_grouper = None
         self._orig_cats = None
         self._index = index
@@ -494,8 +493,6 @@ def __init__(
         self._dropna = dropna
         self._uniques = uniques
 
-        self._passed_categorical = False
-
         # we have a single grouper which may be a myriad of things,
         # some of which are dependent on the passing in level
 
@@ -509,78 +506,83 @@ def __init__(
             else:
                 index_level = index
 
-            if self.grouping_vector is None:
-                self.grouping_vector = index_level
+            if grouping_vector is None:
+                grouping_vector = index_level
             else:
-                mapper = self.grouping_vector
-                self.grouping_vector = index_level.map(mapper)
+                mapper = grouping_vector
+                grouping_vector = index_level.map(mapper)
 
         # a passed Grouper like, directly get the grouper in the same way
         # as single grouper groupby, use the group_info to get codes
-        elif isinstance(self.grouping_vector, Grouper):
+        elif isinstance(grouping_vector, Grouper):
             # get the new grouper; we already have disambiguated
             # what key/level refer to exactly, don't need to
             # check again as we have by this point converted these
             # to an actual value (rather than a pd.Grouper)
             assert self.obj is not None  # for mypy
-            newgrouper, newobj = self.grouping_vector._get_grouper(
-                self.obj, validate=False
-            )
+            newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False)
             self.obj = newobj
 
-            ng = newgrouper._get_grouper()
             if isinstance(newgrouper, ops.BinGrouper):
-                # in this case we have `ng is newgrouper`
-                self.grouping_vector = ng
+                # TODO: can we unwrap this and get a tighter typing
+                #  for self.grouping_vector?
+                grouping_vector = newgrouper
             else:
                 # ops.BaseGrouper
+                # TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1.
+                #  If that were to occur, would we be throwing out information?
+                # error: Cannot determine type of "grouping_vector"  [has-type]
+                ng = newgrouper.groupings[0].grouping_vector  # type: ignore[has-type]
                 # use Index instead of ndarray so we can recover the name
-                self.grouping_vector = Index(ng, name=newgrouper.result_index.name)
+                grouping_vector = Index(ng, name=newgrouper.result_index.name)
 
         elif not isinstance(
-            self.grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
+            grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
         ):
             # no level passed
-            if getattr(self.grouping_vector, "ndim", 1) != 1:
-                t = self.name or str(type(self.grouping_vector))
+            if getattr(grouping_vector, "ndim", 1) != 1:
+                t = str(type(grouping_vector))
                 raise ValueError(f"Grouper for '{t}' not 1-dimensional")
 
-            self.grouping_vector = index.map(self.grouping_vector)
+            grouping_vector = index.map(grouping_vector)
 
             if not (
-                hasattr(self.grouping_vector, "__len__")
-                and len(self.grouping_vector) == len(index)
+                hasattr(grouping_vector, "__len__")
+                and len(grouping_vector) == len(index)
             ):
-                grper = pprint_thing(self.grouping_vector)
+                grper = pprint_thing(grouping_vector)
                 errmsg = (
                     "Grouper result violates len(labels) == "
                     f"len(data)\nresult: {grper}"
                 )
-                self.grouping_vector = None  # Try for sanity
                 raise AssertionError(errmsg)
 
-        if isinstance(self.grouping_vector, np.ndarray):
-            if self.grouping_vector.dtype.kind in ["m", "M"]:
+        if isinstance(grouping_vector, np.ndarray):
+            if grouping_vector.dtype.kind in ["m", "M"]:
                 # if we have a date/time-like grouper, make sure that we have
                 # Timestamps like
                 # TODO 2022-10-08 we only have one test that gets here and
                 #  values are already in nanoseconds in that case.
-                self.grouping_vector = Series(self.grouping_vector).to_numpy()
-        elif is_categorical_dtype(self.grouping_vector):
+                grouping_vector = Series(grouping_vector).to_numpy()
+        elif is_categorical_dtype(grouping_vector):
             # a passed Categorical
-            self._passed_categorical = True
-
-            self._orig_cats = self.grouping_vector.categories
-            self.grouping_vector, self._all_grouper = recode_for_groupby(
-                self.grouping_vector, sort, observed
+            self._orig_cats = grouping_vector.categories
+            grouping_vector, self._all_grouper = recode_for_groupby(
+                grouping_vector, sort, observed
             )
 
+        self.grouping_vector = grouping_vector
+
     def __repr__(self) -> str:
         return f"Grouping({self.name})"
 
     def __iter__(self) -> Iterator:
         return iter(self.indices)
 
+    @cache_readonly
+    def _passed_categorical(self) -> bool:
+        return is_categorical_dtype(self.grouping_vector)
+
     @cache_readonly
     def name(self) -> Hashable:
         ilevel = self._ilevel
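The third bullet replaces the _passed_categorical flag, previously set during construction, with a property derived lazily from the stored vector. A rough stand-alone equivalent, using functools.cached_property as a stand-in for pandas' internal cache_readonly and an isinstance check as a stand-in for is_categorical_dtype (class and helper names here are illustrative assumptions):

from functools import cached_property  # stand-in for pandas' internal cache_readonly

import pandas as pd

class GroupingSketch:
    def __init__(self, grouping_vector) -> None:
        self.grouping_vector = grouping_vector

    @cached_property
    def _passed_categorical(self) -> bool:
        # Computed from the stored vector on first access and then cached,
        # instead of being tracked as a separate boolean during __init__.
        return isinstance(getattr(self.grouping_vector, "dtype", None), pd.CategoricalDtype)

print(GroupingSketch(pd.Categorical(["a", "b", "a"]))._passed_categorical)  # True
print(GroupingSketch(pd.Index([1, 2, 3]))._passed_categorical)              # False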
18 changes: 0 additions & 18 deletions pandas/core/groupby/ops.py
@@ -745,15 +745,6 @@ def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter:
         ids, _, ngroups = self.group_info
         return get_splitter(data, ids, ngroups, axis=axis)
 
-    def _get_grouper(self):
-        """
-        We are a grouper as part of another's groupings.
-
-        We have a specific method of grouping, so cannot
-        convert to a Index for our grouper.
-        """
-        return self.groupings[0].grouping_vector
-
     @final
     @cache_readonly
     def group_keys_seq(self):
@@ -1110,15 +1101,6 @@ def nkeys(self) -> int:
         # still matches len(self.groupings), but we can hard-code
         return 1
 
-    def _get_grouper(self):
-        """
-        We are a grouper as part of another's groupings.
-
-        We have a specific method of grouping, so cannot
-        convert to a Index for our grouper.
-        """
-        return self
-
     def get_iterator(self, data: NDFrame, axis: AxisInt = 0):
         """
         Groupby iterator
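Removing these two _get_grouper implementations works because their only caller, Grouping.__init__, now branches on the concrete type itself (see the grouper.py hunk above): a BinGrouper stands in for the grouping vector directly, while for a BaseGrouper the first grouping's vector is taken. A schematic of that inlining, with placeholder classes rather than the real pandas ones:

class BinGrouperLike:
    """Placeholder for ops.BinGrouper: it is used as the grouping vector itself."""

class GroupingLike:
    def __init__(self, grouping_vector):
        self.grouping_vector = grouping_vector

class BaseGrouperLike:
    """Placeholder for ops.BaseGrouper: exposes .groupings[0].grouping_vector."""

    def __init__(self, groupings):
        self.groupings = groupings

def resolve_grouping_vector(newgrouper):
    # Formerly newgrouper._get_grouper(); after the commit the single caller
    # performs this dispatch inline instead of going through a helper method.
    if isinstance(newgrouper, BinGrouperLike):
        return newgrouper
    return newgrouper.groupings[0].grouping_vector

print(resolve_grouping_vector(BaseGrouperLike([GroupingLike([0, 1, 0])])))  # [0, 1, 0]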
