From 0cd97b34476d2d4adcbbc8abd6c38bf1fd902a21 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 11 Aug 2020 09:24:49 -0400 Subject: [PATCH 1/5] Include key columns only in DataFrameGroupBy.nunique(), not SeriesGroupBy --- python/cudf/cudf/core/groupby/groupby.py | 14 +++++++------- python/cudf/cudf/tests/test_groupby.py | 11 +++++++++++ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 70341593f19..d2e98213b91 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -217,13 +217,6 @@ def nth(self, n): sizes = self.size() return result[n < sizes] - def nunique(self): - """ - Return the number of unique values per group. - """ - # Pandas includes key columns for nunique: - return self.agg(dict.fromkeys(self.obj._data.keys(), "nunique")) - def serialize(self): header = {} frames = [] @@ -630,6 +623,13 @@ def __getattribute__(self, key): def __getitem__(self, key): return self.obj[key].groupby(self.grouping, dropna=self._dropna) + def nunique(self): + """ + Return the number of unique values per group + """ + # For DataFrameGroupBy, Pandas includes key columns for nunique: + return self.agg(dict.fromkeys(self.obj._data.keys(), "nunique")) + class SeriesGroupBy(GroupBy): def __init__( diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 16547aa4f43..ead434318c9 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1196,3 +1196,14 @@ def test_groupby_groups_multi(by): for key in pdg.groups: assert key in gdg.groups assert_eq(pdg.groups[key], gdg.groups[key]) + + +def test_groupby_nunique_series(): + pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, 3, 1, 1, 2]}) + gdf = cudf.from_pandas(pdf) + + assert_eq( + pdf.groupby("a")["b"].nunique(), + gdf.groupby("a")["b"].nunique(), + check_dtype=False, + ) From ac8f706f203bff1bb60718153848d6f7b2432e99 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 11 Aug 2020 09:29:01 -0400 Subject: [PATCH 2/5] Changelog --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09bbc334f52..93fd9261c53 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +# cuDF 0.16.0 (Date TBD) + +## New Features + +## Improvements + +## Bug Fixes + +- PR #5921 Fix SeriesGroupBy.nunique() to return a Series + + # cuDF 0.15.0 (Date TBD) ## New Features From dbce5cba42879789e665ff253dc8ef63ad6df6ab Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 11 Aug 2020 11:26:33 -0400 Subject: [PATCH 3/5] Move changelog entry --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 93fd9261c53..efceeabf7ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,6 @@ ## Bug Fixes -- PR #5921 Fix SeriesGroupBy.nunique() to return a Series - # cuDF 0.15.0 (Date TBD) @@ -249,6 +247,7 @@ - PR #5319 Disallow SUM and specialize MEAN of timestamp types - PR #5797 Fix a missing data issue in some Parquet files - PR #5787 Fix column create from dictionary column view +- PR #5921 Fix SeriesGroupBy.nunique() to return a Series # cuDF 0.14.0 (03 Jun 2020) From 681bba9f026087fb30a491deb688d4522fc8771e Mon Sep 17 00:00:00 2001 From: Keith Kraus Date: Tue, 11 Aug 2020 11:36:46 -0400 Subject: [PATCH 4/5] Remove 0.16 changelog section --- CHANGELOG.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 36852fa5205..43c7da9e484 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,3 @@ -# cuDF 0.16.0 (Date TBD) - -## New Features - -## Improvements - -## Bug Fixes - - # cuDF 0.15.0 (Date TBD) ## New Features From 46cfe9c75e75b76f83823ca8741c31ea1310e684 Mon Sep 17 00:00:00 2001 From: Keith Kraus Date: Tue, 11 Aug 2020 12:11:33 -0400 Subject: [PATCH 5/5] Fix changelog # --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 43c7da9e484..dbde4aa9f74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -266,7 +266,7 @@ - PR #5319 Disallow SUM and specialize MEAN of timestamp types - PR #5797 Fix a missing data issue in some Parquet files - PR #5787 Fix column create from dictionary column view -- PR #5921 Fix SeriesGroupBy.nunique() to return a Series +- PR #5926 Fix SeriesGroupBy.nunique() to return a Series - PR #5813 Fix normalizer exception with all-null strings column - PR #5820 Fix ListColumn.to_arrow for all null case - PR #5837 Bash syntax error in prebuild.sh preventing `cudf_kafka` and `libcudf_kafka` from being uploaded to Anaconda