Skip to content

Commit

Permalink
Merge pull request #5926 from shwina/fix-groupby-nunique-series-redux
Browse files Browse the repository at this point in the history
[REVIEW] Fix SeriesGroupBy.nunique() to return a Series
  • Loading branch information
Keith Kraus authored Aug 13, 2020
2 parents e3e5b53 + 46cfe9c commit d24d0ab
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@
- PR #5319 Disallow SUM and specialize MEAN of timestamp types
- PR #5797 Fix a missing data issue in some Parquet files
- PR #5787 Fix column create from dictionary column view
- PR #5926 Fix SeriesGroupBy.nunique() to return a Series
- PR #5813 Fix normalizer exception with all-null strings column
- PR #5820 Fix ListColumn.to_arrow for all null case
- PR #5837 Fix Bash syntax error in `prebuild.sh` that prevented `cudf_kafka` and `libcudf_kafka` from being uploaded to Anaconda
Expand Down
14 changes: 7 additions & 7 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,13 +217,6 @@ def nth(self, n):
sizes = self.size()
return result[n < sizes]

def nunique(self):
    """
    Return the number of unique values per group.

    Every column name found in ``self.obj._data`` is mapped to the
    ``"nunique"`` aggregation and the resulting spec is handed to
    ``self.agg``.
    """
    # Pandas includes key columns for nunique:
    column_names = self.obj._data.keys()
    spec = dict.fromkeys(column_names, "nunique")
    return self.agg(spec)

def serialize(self):
header = {}
frames = []
Expand Down Expand Up @@ -630,6 +623,13 @@ def __getattribute__(self, key):
def __getitem__(self, key):
    """
    Select ``key`` from the wrapped object and group the selection.

    The selection is grouped by this object's ``grouping`` and the
    ``dropna`` setting stored on this object is forwarded unchanged.
    """
    selection = self.obj[key]
    return selection.groupby(self.grouping, dropna=self._dropna)

def nunique(self):
    """
    Return the number of unique values per group.

    Builds an aggregation spec that applies ``"nunique"`` to every
    column name in ``self.obj._data`` and delegates to ``self.agg``.
    """
    # For DataFrameGroupBy, Pandas includes key columns for nunique:
    aggregations = {name: "nunique" for name in self.obj._data.keys()}
    return self.agg(aggregations)


class SeriesGroupBy(GroupBy):
def __init__(
Expand Down
11 changes: 11 additions & 0 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1196,3 +1196,14 @@ def test_groupby_groups_multi(by):
for key in pdg.groups:
assert key in gdg.groups
assert_eq(pdg.groups[key], gdg.groups[key])


def test_groupby_nunique_series():
    """
    Compare ``groupby("a")["b"].nunique()`` between pandas and cudf.

    Dtypes are not compared (``check_dtype=False``); only the values
    and structure of the two results are checked by ``assert_eq``.
    """
    pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, 3, 1, 1, 2]})
    gdf = cudf.from_pandas(pdf)

    # Evaluate the pandas reference first, then the cudf result.
    expect = pdf.groupby("a")["b"].nunique()
    got = gdf.groupby("a")["b"].nunique()

    assert_eq(expect, got, check_dtype=False)

0 comments on commit d24d0ab

Please sign in to comment.