From 388c314a5cb402f6456b433140c28eb511dd6601 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 10 Jun 2024 12:59:35 -0700
Subject: [PATCH 01/10] Fix nunique for all null case and mulitiindex

---
 cpp/src/stream_compaction/distinct_count.cu  |  9 ++++++---
 python/cudf/cudf/core/dataframe.py           |  8 +++++---
 python/cudf/cudf/core/frame.py               |  7 +++----
 python/cudf/cudf/core/index.py               |  2 +-
 python/cudf/cudf/core/multiindex.py          |  8 ++++++++
 python/cudf/cudf/core/single_column_frame.py |  4 +---
 python/cudf/cudf/tests/test_dataframe.py     | 14 ++++++++++++++
 python/cudf/cudf/tests/test_multiindex.py    | 11 +++++++++++
 python/cudf/cudf/tests/test_series.py        | 10 ++++++++++
 9 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index b7aadbe14fa..c667bae80fc 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -187,13 +187,16 @@ cudf::size_type distinct_count(column_view const& input,
                                nan_policy nan_handling,
                                rmm::cuda_stream_view stream)
 {
-  if (0 == input.size() or input.null_count() == input.size()) { return 0; }
+  if (0 == input.size()) { return 0; }
 
   auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream);
 
-  // Check for nulls. If the null policy is EXCLUDE and null values were found,
-  // we decrement the count.
+  // Check for nulls.
   auto const has_null = input.has_nulls();
+  // If the null policy is INCLUDE and all the values were null, return count (1).
+  if (null_handling == null_policy::INCLUDE and has_null and count == 1) { return count; }
+  // If the null policy is EXCLUDE and null values were found,
+  // we decrement the count.
   if (null_handling == null_policy::EXCLUDE and has_null) { --count; }
 
   // Check for NaNs. There are two cases that can lead to decrementing the
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index e1b6cc45dd3..753d3eda1f7 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7475,7 +7475,7 @@ def __dataframe__(
             self, nan_as_null=nan_as_null, allow_copy=allow_copy
         )
 
-    def nunique(self, axis=0, dropna=True):
+    def nunique(self, axis=0, dropna: bool = True) -> Series:
         """
         Count number of distinct elements in specified axis.
         Return Series with number of distinct elements. Can ignore NaN values.
@@ -7503,8 +7503,10 @@ def nunique(self, axis=0, dropna=True):
         """
         if axis != 0:
             raise NotImplementedError("axis parameter is not supported yet.")
-
-        return cudf.Series(super().nunique(dropna=dropna))
+        counts = [col.distinct_count(dropna=dropna) for col in self._columns]
+        return self._constructor_sliced(
+            counts, index=self._data.to_pandas_index()
+        )
 
     def _sample_axis_1(
         self,
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index af8886a44a6..6b8dc880f72 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1940,10 +1940,9 @@ def nunique(self, dropna: bool = True):
         dict
             Name and unique value counts of each column in frame.
         """
-        return {
-            name: col.distinct_count(dropna=dropna)
-            for name, col in self._data.items()
-        }
+        raise NotImplementedError(
+            f"{type(self).__name__} does not implement nunique"
+        )
 
     @staticmethod
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 732e5cdb01a..be95c11e336 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -895,7 +895,7 @@ def __array__(self, dtype=None):
         )
 
     @_cudf_nvtx_annotate
-    def nunique(self) -> int:
+    def nunique(self, dropna: bool = True) -> int:
         return len(self)
 
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 11b4b9154a2..af568b34e08 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1701,6 +1701,14 @@ def fillna(self, value):
     def unique(self):
         return self.drop_duplicates(keep="first")
 
+    @_cudf_nvtx_annotate
+    def nunique(self, dropna: bool = True) -> int:
+        if dropna:
+            mi = self.dropna(how="all")
+        else:
+            mi = self
+        return len(mi.unique())
+
     def _clean_nulls_from_index(self):
         """
         Convert all na values(if any) in MultiIndex object
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index d864b563208..a17a2f9665c 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -335,7 +335,7 @@ def _make_operands_for_binop(
         return {result_name: (self._column, other, reflect, fill_value)}
 
     @_cudf_nvtx_annotate
-    def nunique(self, dropna: bool = True):
+    def nunique(self, dropna: bool = True) -> int:
         """
         Return count of unique values for the column.
 
@@ -349,8 +349,6 @@ def nunique(self, dropna: bool = True):
         int
             Number of unique values in the column.
         """
-        if self._column.null_count == len(self):
-            return 0
         return self._column.distinct_count(dropna=dropna)
 
     def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]:
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 98e9f9881c7..49d1993225c 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9966,6 +9966,20 @@ def test_dataframe_nunique(data):
     assert_eq(expected, actual)
 
 
+@pytest.mark.parametrize(
+    "columns",
+    [
+        pd.RangeIndex(2, name="foo"),
+        pd.MultiIndex.from_arrays([[1, 2], [2, 3]], names=["foo", 1]),
+        pd.Index([3, 5], dtype=np.int8, name="foo"),
+    ],
+)
+def test_nunique_preserve_column_in_index(columns):
+    df = cudf.DataFrame([[1, 2]], columns=columns)
+    result = df.nunique().index.to_pandas()
+    pd.testing.assert_index_equal(result, columns, exact=True)
+
+
 @pytest.mark.parametrize(
     "data",
     [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}],
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index f143112a45f..6f20e67995b 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -2162,3 +2162,14 @@ def test_multi_index_contains_hashable():
         lfunc_args_and_kwargs=((),),
         rfunc_args_and_kwargs=((),),
     )
+
+
+@pytest.mark.parametrize("array", [[1, 2], [1, np.nan], [np.nan] * 2])
+@pytest.mark.parametrize("dropna", [True, False])
+def test_nunique(array, dropna):
+    arrays = [array, [3, 4]]
+    gidx = cudf.MultiIndex.from_arrays(arrays)
+    pidx = pd.MultiIndex.from_arrays(arrays)
+    result = gidx.nunique(dropna=dropna)
+    expected = pidx.nunique(dropna=dropna)
+    assert result == expected
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index f47c42d9a1d..5979a234805 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2848,3 +2848,13 @@ def test_nans_to_nulls_noop_copies_column(value):
     ser1 = cudf.Series([value])
     ser2 = ser1.nans_to_nulls()
     assert ser1._column is not ser2._column
+
+
+@pytest.mark.parametrize("dropna", [False, True])
+def test_nunique_all_null(dropna):
+    data = [np.nan, np.nan]
+    pd_ser = pd.Series(data)
+    cudf_ser = cudf.Series(data, nan_as_null=True)
+    result = pd_ser.nunique(dropna=dropna)
+    expected = cudf_ser.nunique(dropna=dropna)
+    assert result == expected

From a30529fedc986da611025cc159be9e08be23db28 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 10 Jun 2024 14:52:25 -0700
Subject: [PATCH 02/10] Do quick checks before calling distinct count

---
 cpp/src/stream_compaction/distinct_count.cu | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index c667bae80fc..bad3ee4930e 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -189,12 +189,19 @@ cudf::size_type distinct_count(column_view const& input,
 {
   if (0 == input.size()) { return 0; }
 
+  if (input.null_count() == input.size()) {
+    if (null_handling == null_policy::INCLUDE) {
+      return 1;
+    } else {
+      return 0;
+    }
+  }
+
   auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream);
 
-  // Check for nulls.
+  // Check for nulls. If the null policy is EXCLUDE and null values were found,
+  // we decrement the count.
   auto const has_null = input.has_nulls();
-  // If the null policy is INCLUDE and all the values were null, return count (1).
-  if (null_handling == null_policy::INCLUDE and has_null and count == 1) { return count; }
   // If the null policy is EXCLUDE and null values were found,
   // we decrement the count.
   if (null_handling == null_policy::EXCLUDE and has_null) { --count; }

From 97e556deee69ff8b92e6b6687e7f7e1de45dc7b1 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 10 Jun 2024 14:53:47 -0700
Subject: [PATCH 03/10] remove comment

---
 cpp/src/stream_compaction/distinct_count.cu | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index bad3ee4930e..dd72a793917 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -202,8 +202,6 @@ cudf::size_type distinct_count(column_view const& input,
   // Check for nulls. If the null policy is EXCLUDE and null values were found,
   // we decrement the count.
   auto const has_null = input.has_nulls();
-  // If the null policy is EXCLUDE and null values were found,
-  // we decrement the count.
   if (null_handling == null_policy::EXCLUDE and has_null) { --count; }
 
   // Check for NaNs. There are two cases that can lead to decrementing the

From 7bb06f175f1af671fd8f35ad3f1689e04f31b057 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 13 Jun 2024 10:31:19 -0700
Subject: [PATCH 04/10] Update python/cudf/cudf/tests/test_dataframe.py

Co-authored-by: GALI PREM SAGAR <sagarprem75@gmail.com>
---
 python/cudf/cudf/tests/test_dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 49d1993225c..649821b9b7c 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9977,7 +9977,7 @@ def test_dataframe_nunique(data):
 def test_nunique_preserve_column_in_index(columns):
     df = cudf.DataFrame([[1, 2]], columns=columns)
     result = df.nunique().index.to_pandas()
-    pd.testing.assert_index_equal(result, columns, exact=True)
+    assert_eq(result, columns, exact=True)
 
 
 @pytest.mark.parametrize(

From 56309dea6b31518b8b7f8a57a18d2e9a5e379e74 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 13 Jun 2024 10:49:23 -0700
Subject: [PATCH 05/10] Use None instead of NaN

---
 python/cudf/cudf/tests/test_multiindex.py | 2 +-
 python/cudf/cudf/tests/test_series.py     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index 6f20e67995b..2e41a30f870 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -2164,7 +2164,7 @@ def test_multi_index_contains_hashable():
     )
 
 
-@pytest.mark.parametrize("array", [[1, 2], [1, np.nan], [np.nan] * 2])
+@pytest.mark.parametrize("array", [[1, 2], [1, None], [None] * 2])
 @pytest.mark.parametrize("dropna", [True, False])
 def test_nunique(array, dropna):
     arrays = [array, [3, 4]]
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index f260e122c0e..52956c230ba 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2855,9 +2855,9 @@ def test_nans_to_nulls_noop_copies_column(value):
 
 @pytest.mark.parametrize("dropna", [False, True])
 def test_nunique_all_null(dropna):
-    data = [np.nan, np.nan]
+    data = [None, None]
     pd_ser = pd.Series(data)
-    cudf_ser = cudf.Series(data, nan_as_null=True)
+    cudf_ser = cudf.Series(data)
     result = pd_ser.nunique(dropna=dropna)
     expected = cudf_ser.nunique(dropna=dropna)
     assert result == expected

From 61ad4af3bf1f95c830c7667003f26feb3a5e77ad Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 14 Jun 2024 09:52:08 -0700
Subject: [PATCH 06/10] Update cpp/src/stream_compaction/distinct_count.cu

Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com>
---
 cpp/src/stream_compaction/distinct_count.cu | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index dd72a793917..83585f658b1 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -189,12 +189,7 @@ cudf::size_type distinct_count(column_view const& input,
 {
   if (0 == input.size()) { return 0; }
 
-  if (input.null_count() == input.size()) {
-    if (null_handling == null_policy::INCLUDE) {
-      return 1;
-    } else {
-      return 0;
-    }
+  if (input.null_count() == input.size()) { return static_cast<size_type>(null_handling == null_policy::INCLUDE); }
   }
 
   auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream);

From 7c680ef697c6a168170edd40d8e58e8ef4dc0b30 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 14 Jun 2024 09:52:15 -0700
Subject: [PATCH 07/10] Update python/cudf/cudf/tests/test_multiindex.py

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/tests/test_multiindex.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index 2e41a30f870..7b95e4f9a44 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -2164,7 +2164,7 @@ def test_multi_index_contains_hashable():
     )
 
 
-@pytest.mark.parametrize("array", [[1, 2], [1, None], [None] * 2])
+@pytest.mark.parametrize("array", [[1, 2], [1, None], [None, None]])
 @pytest.mark.parametrize("dropna", [True, False])
 def test_nunique(array, dropna):
     arrays = [array, [3, 4]]

From 439a67d0b297bd0b114fe2af15c58850ae01f11d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 14 Jun 2024 09:52:23 -0700
Subject: [PATCH 08/10] Update python/cudf/cudf/core/multiindex.py

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/multiindex.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index f062a49f68b..9c387d94910 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1748,10 +1748,7 @@ def unique(self):
 
     @_cudf_nvtx_annotate
     def nunique(self, dropna: bool = True) -> int:
-        if dropna:
-            mi = self.dropna(how="all")
-        else:
-            mi = self
+        mi = self.dropna(how="all") if dropna else self
         return len(mi.unique())
 
     def _clean_nulls_from_index(self):

From 0a39911053ebba8e3eef16271f6a8012fb2635ca Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 14 Jun 2024 10:14:32 -0700
Subject: [PATCH 09/10] Update cpp/src/stream_compaction/distinct_count.cu

---
 cpp/src/stream_compaction/distinct_count.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index 83585f658b1..d6a2d5d3b93 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -190,7 +190,6 @@ cudf::size_type distinct_count(column_view const& input,
   if (0 == input.size()) { return 0; }
 
   if (input.null_count() == input.size()) { return static_cast<size_type>(null_handling == null_policy::INCLUDE); }
-  }
 
   auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream);
 

From b203ea10ac82d94f33ad9042b9570d46277cd51d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 14 Jun 2024 10:19:55 -0700
Subject: [PATCH 10/10] pre-commit

---
 cpp/src/stream_compaction/distinct_count.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index d6a2d5d3b93..99ca89cc021 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -189,7 +189,9 @@ cudf::size_type distinct_count(column_view const& input,
 {
   if (0 == input.size()) { return 0; }
 
-  if (input.null_count() == input.size()) { return static_cast<size_type>(null_handling == null_policy::INCLUDE); }
+  if (input.null_count() == input.size()) {
+    return static_cast<size_type>(null_handling == null_policy::INCLUDE);
+  }
 
   auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream);