From 7291d3af443e52dc4b280e7462850535eb7bf79b Mon Sep 17 00:00:00 2001
From: martinfalisse <martinmotteditfalisse@gmail.com>
Date: Thu, 27 Jan 2022 12:29:14 +0100
Subject: [PATCH 1/3] Add DataFrame and Index nunique

---
 python/cudf/cudf/core/dataframe.py           | 31 ++++++++++++++++++++
 python/cudf/cudf/core/frame.py               | 21 +++++++++++++
 python/cudf/cudf/core/series.py              |  2 +-
 python/cudf/cudf/core/single_column_frame.py | 18 ++++++++++++
 python/cudf/cudf/tests/test_dataframe.py     | 24 +++++++++++++++
 python/cudf/cudf/tests/test_series.py        | 24 +++++++++++++++
 6 files changed, 119 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index c686cd0fd39..f9fef7dc4dc 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6145,6 +6145,37 @@ def __dataframe__(
             self, nan_as_null=nan_as_null, allow_copy=allow_copy
         )
 
+    def nunique(self, axis=0, dropna=True):
+        """
+        Count number of distinct elements in specified axis.
+        Return Series with number of distinct elements. Can ignore NaN values.
+        
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+        dropna : bool, default True
+            Don't include NaN in the counts.
+
+        Returns
+        -------
+        Series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
+        >>> df.nunique()
+        A    3
+        B    2
+        dtype: int64
+        """
+        if axis != 0:
+            raise NotImplementedError(
+                "axis parameter is not supported yet."
+            )
+
+        return cudf.Series(super().nunique(method="sort", dropna=dropna))
 
 def from_dataframe(df, allow_copy=False):
     return df_protocol.from_dataframe(df, allow_copy=allow_copy)
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 1d59d9f3b1a..6142f0f0f40 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -6420,6 +6420,27 @@ def ge(self, other, axis="columns", level=None, fill_value=None):
             other=other, fn="ge", fill_value=fill_value, can_reindex=True
         )
 
+    def nunique(self, method: builtins.str = "sort", dropna: bool = True):
+        """
+        Return a mapping from each column name to the count of unique
+        values in that column.
+
+        Parameters
+        ----------
+        method : builtins.str, default "sort"
+            Method used by cpp_distinct_count
+        dropna : bool, default True
+            Don't include NaN in the counts.
+
+        Returns
+        -------
+        dict
+            Name and unique value counts of each column in frame.
+        """
+        return {
+            name: col.distinct_count(method=method, dropna=dropna) 
+                for name, col in self._data.items()
+        }
 
 def _get_replacement_values_for_columns(
     to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any]
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index e96531d4b1c..0032dc25cee 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -2947,7 +2947,7 @@ def nunique(self, method="sort", dropna=True):
             raise NotImplementedError(msg)
         if self.null_count == len(self):
             return 0
-        return self._column.distinct_count(method, dropna)
+        return super().nunique(method, dropna)
 
     def value_counts(
         self,
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 7793a2fdf29..85a85b1930f 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -336,3 +336,21 @@ def _make_operands_for_binop(
                 return NotImplemented
 
         return {result_name: (self._column, other, reflect, fill_value)}
+
+    def nunique(self, method: builtins.str = "sort", dropna: bool = True):
+        """
+        Returns count of unique values for the column.
+
+        Parameters
+        ----------
+        method : builtins.str, default "sort"
+            Method used by cpp_distinct_count
+        dropna : bool, default True
+            Don't include NaN in the counts.
+
+        Returns
+        -------
+        int
+            Number of unique values in the column.
+        """
+        return sum(super().nunique(method=method, dropna=dropna).values())
\ No newline at end of file
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 3e359335719..d81451e9d54 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9077,3 +9077,27 @@ def test_dataframe_assign_cp_np_array():
     gdf[[f"f_{i}" for i in range(n)]] = cp_ndarray
 
     assert_eq(pdf, gdf)
+
+@pytest.mark.parametrize(
+    "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}],
+)
+def test_dataframe_nunique(data):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    actual = gdf.nunique()
+    expected = pdf.nunique()
+
+    assert_eq(expected, actual)
+
+@pytest.mark.parametrize(
+    "data", [{ "key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}],
+)
+def test_dataframe_nunique_index(data):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    actual = gdf.index.nunique()
+    expected = pdf.index.nunique()
+
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index ffdd53c58ac..ee891828b3c 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -1517,3 +1517,27 @@ def test_series_transpose(data):
     assert_eq(pd_transposed, cudf_transposed)
     assert_eq(pd_property, cudf_property)
     assert_eq(cudf_transposed, csr)
+
+@pytest.mark.parametrize(
+    "data", [[1, 3, 5, 7, 7]],
+)
+def test_series_nunique(data):
+    # One parametrized case: a Series with a repeated value (4 distinct).
+    cd_s = cudf.Series(data)
+    pd_s = cd_s.to_pandas()
+    actual = cd_s.nunique()
+    expected = pd_s.nunique()
+
+    assert_eq(expected, actual)
+    
+@pytest.mark.parametrize(
+    "data", [[1, 3, 5, 7, 7]],
+)
+def test_series_nunique_index(data):
+    # One parametrized case: the default index over a five-element Series.
+    cd_s = cudf.Series(data)
+    pd_s = cd_s.to_pandas()
+    actual = cd_s.index.nunique()
+    expected = pd_s.index.nunique()
+
+    assert_eq(expected, actual)
\ No newline at end of file

From 90c2c0ea1c4a642d19da9afb2754462f3fb21ac1 Mon Sep 17 00:00:00 2001
From: martinfalisse <45781926+martinfalisse@users.noreply.github.com>
Date: Mon, 31 Jan 2022 12:55:34 +0100
Subject: [PATCH 2/3] Use distinct_count instead of super().nunique in
 single_column_frame

Co-authored-by: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
---
 python/cudf/cudf/core/single_column_frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 85a85b1930f..e480e31bc4b 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -353,4 +353,4 @@ def nunique(self, method: builtins.str = "sort", dropna: bool = True):
         int
             Number of unique values in the column.
         """
-        return sum(super().nunique(method=method, dropna=dropna).values())
\ No newline at end of file
+        return self._column.distinct_count(method=method, dropna=dropna)
\ No newline at end of file

From bcd84ad76fd0f2676d3ced0db58dc31c018addcb Mon Sep 17 00:00:00 2001
From: martinfalisse <martinmotteditfalisse@gmail.com>
Date: Wed, 2 Feb 2022 22:43:58 +0100
Subject: [PATCH 3/3] Remove unnecessary nunique function in series.

---
 python/cudf/cudf/core/dataframe.py           | 10 +++++-----
 python/cudf/cudf/core/frame.py               |  6 ++++--
 python/cudf/cudf/core/single_column_frame.py |  5 +++--
 python/cudf/cudf/tests/test_dataframe.py     |  3 +--
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index bb9cd9b5cc7..3735a949277 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6031,11 +6031,12 @@ def nunique(self, axis=0, dropna=True):
         """
         Count number of distinct elements in specified axis.
         Return Series with number of distinct elements. Can ignore NaN values.
-        
+
         Parameters
         ----------
         axis : {0 or 'index', 1 or 'columns'}, default 0
-            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
+            column-wise.
         dropna : bool, default True
             Don't include NaN in the counts.
 
@@ -6053,12 +6054,11 @@ def nunique(self, axis=0, dropna=True):
         dtype: int64
         """
         if axis != 0:
-            raise NotImplementedError(
-                "axis parameter is not supported yet."
-            )
+            raise NotImplementedError("axis parameter is not supported yet.")
 
         return cudf.Series(super().nunique(method="sort", dropna=dropna))
 
+
 def from_dataframe(df, allow_copy=False):
     return df_protocol.from_dataframe(df, allow_copy=allow_copy)
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index a05986555b1..7e97d655147 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import builtins
 import copy
 import pickle
 import warnings
@@ -6420,10 +6421,11 @@ def nunique(self, method: builtins.str = "sort", dropna: bool = True):
             Name and unique value counts of each column in frame.
         """
         return {
-            name: col.distinct_count(method=method, dropna=dropna) 
-                for name, col in self._data.items()
+            name: col.distinct_count(method=method, dropna=dropna)
+            for name, col in self._data.items()
         }
 
+
 def _get_replacement_values_for_columns(
     to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any]
 ) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]:
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index ea1917acc10..ef479f19363 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import builtins
 from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union
 
 import cupy
@@ -328,7 +329,7 @@ def _make_operands_for_binop(
 
     def nunique(self, method: builtins.str = "sort", dropna: bool = True):
         """
-        Returns count of unique values for the column.
+        Return count of unique values for the column.
 
         Parameters
         ----------
@@ -342,4 +343,4 @@ def nunique(self, method: builtins.str = "sort", dropna: bool = True):
         int
             Number of unique values in the column.
         """
-        return self._column.distinct_count(method=method, dropna=dropna)
\ No newline at end of file
+        return self._column.distinct_count(method=method, dropna=dropna)
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index d0a25fd3e8c..ba2caf7c6c8 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9095,7 +9095,7 @@ def test_dataframe_nunique(data):
 
 
 @pytest.mark.parametrize(
-    "data", [{ "key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}],
+    "data", [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}],
 )
 def test_dataframe_nunique_index(data):
     gdf = cudf.DataFrame(data)
@@ -9113,4 +9113,3 @@ def test_dataframe_rename_duplicate_column():
         ValueError, match="Duplicate column names are not allowed"
     ):
         gdf.rename(columns={"a": "b"}, inplace=True)
-