From 388c314a5cb402f6456b433140c28eb511dd6601 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 10 Jun 2024 12:59:35 -0700 Subject: [PATCH 01/10] Fix nunique for all null case and mulitiindex --- cpp/src/stream_compaction/distinct_count.cu | 9 ++++++--- python/cudf/cudf/core/dataframe.py | 8 +++++--- python/cudf/cudf/core/frame.py | 7 +++---- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/multiindex.py | 8 ++++++++ python/cudf/cudf/core/single_column_frame.py | 4 +--- python/cudf/cudf/tests/test_dataframe.py | 14 ++++++++++++++ python/cudf/cudf/tests/test_multiindex.py | 11 +++++++++++ python/cudf/cudf/tests/test_series.py | 10 ++++++++++ 9 files changed, 59 insertions(+), 14 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index b7aadbe14fa..c667bae80fc 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -187,13 +187,16 @@ cudf::size_type distinct_count(column_view const& input, nan_policy nan_handling, rmm::cuda_stream_view stream) { - if (0 == input.size() or input.null_count() == input.size()) { return 0; } + if (0 == input.size()) { return 0; } auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); - // Check for nulls. If the null policy is EXCLUDE and null values were found, - // we decrement the count. + // Check for nulls. auto const has_null = input.has_nulls(); + // If the null policy is INCLUDE and all the values were null, return count (1). + if (null_handling == null_policy::INCLUDE and has_null and count == 1) { return count; } + // If the null policy is EXCLUDE and null values were found, + // we decrement the count. if (null_handling == null_policy::EXCLUDE and has_null) { --count; } // Check for NaNs. There are two cases that can lead to decrementing the diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e1b6cc45dd3..753d3eda1f7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7475,7 +7475,7 @@ def __dataframe__( self, nan_as_null=nan_as_null, allow_copy=allow_copy ) - def nunique(self, axis=0, dropna=True): + def nunique(self, axis=0, dropna: bool = True) -> Series: """ Count number of distinct elements in specified axis. Return Series with number of distinct elements. Can ignore NaN values. @@ -7503,8 +7503,10 @@ def nunique(self, axis=0, dropna=True): """ if axis != 0: raise NotImplementedError("axis parameter is not supported yet.") - - return cudf.Series(super().nunique(dropna=dropna)) + counts = [col.distinct_count(dropna=dropna) for col in self._columns] + return self._constructor_sliced( + counts, index=self._data.to_pandas_index() + ) def _sample_axis_1( self, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index af8886a44a6..6b8dc880f72 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1940,10 +1940,9 @@ def nunique(self, dropna: bool = True): dict Name and unique value counts of each column in frame. """ - return { - name: col.distinct_count(dropna=dropna) - for name, col in self._data.items() - } + raise NotImplementedError( + f"{type(self).__name__} does not implement nunique" + ) @staticmethod @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 732e5cdb01a..be95c11e336 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -895,7 +895,7 @@ def __array__(self, dtype=None): ) @_cudf_nvtx_annotate - def nunique(self) -> int: + def nunique(self, dropna: bool = True) -> int: return len(self) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 11b4b9154a2..af568b34e08 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1701,6 +1701,14 @@ def fillna(self, value): def unique(self): return self.drop_duplicates(keep="first") + @_cudf_nvtx_annotate + def nunique(self, dropna: bool = True) -> int: + if dropna: + mi = self.dropna(how="all") + else: + mi = self + return len(mi.unique()) + def _clean_nulls_from_index(self): """ Convert all na values(if any) in MultiIndex object diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index d864b563208..a17a2f9665c 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -335,7 +335,7 @@ def _make_operands_for_binop( return {result_name: (self._column, other, reflect, fill_value)} @_cudf_nvtx_annotate - def nunique(self, dropna: bool = True): + def nunique(self, dropna: bool = True) -> int: """ Return count of unique values for the column. @@ -349,8 +349,6 @@ def nunique(self, dropna: bool = True): int Number of unique values in the column. """ - if self._column.null_count == len(self): - return 0 return self._column.distinct_count(dropna=dropna) def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 98e9f9881c7..49d1993225c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9966,6 +9966,20 @@ def test_dataframe_nunique(data): assert_eq(expected, actual) +@pytest.mark.parametrize( + "columns", + [ + pd.RangeIndex(2, name="foo"), + pd.MultiIndex.from_arrays([[1, 2], [2, 3]], names=["foo", 1]), + pd.Index([3, 5], dtype=np.int8, name="foo"), + ], +) +def test_nunique_preserve_column_in_index(columns): + df = cudf.DataFrame([[1, 2]], columns=columns) + result = df.nunique().index.to_pandas() + pd.testing.assert_index_equal(result, columns, exact=True) + + @pytest.mark.parametrize( "data", [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}], diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index f143112a45f..6f20e67995b 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2162,3 +2162,14 @@ def test_multi_index_contains_hashable(): lfunc_args_and_kwargs=((),), rfunc_args_and_kwargs=((),), ) + + +@pytest.mark.parametrize("array", [[1, 2], [1, np.nan], [np.nan] * 2]) +@pytest.mark.parametrize("dropna", [True, False]) +def test_nunique(array, dropna): + arrays = [array, [3, 4]] + gidx = cudf.MultiIndex.from_arrays(arrays) + pidx = pd.MultiIndex.from_arrays(arrays) + result = gidx.nunique(dropna=dropna) + expected = pidx.nunique(dropna=dropna) + assert result == expected diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index f47c42d9a1d..5979a234805 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2848,3 +2848,13 @@ def test_nans_to_nulls_noop_copies_column(value): ser1 = cudf.Series([value]) ser2 = ser1.nans_to_nulls() assert ser1._column is not ser2._column + + +@pytest.mark.parametrize("dropna", [False, True]) +def test_nunique_all_null(dropna): + data = [np.nan, np.nan] + pd_ser = pd.Series(data) + cudf_ser = cudf.Series(data, nan_as_null=True) + result = pd_ser.nunique(dropna=dropna) + expected = cudf_ser.nunique(dropna=dropna) + assert result == expected From a30529fedc986da611025cc159be9e08be23db28 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 10 Jun 2024 14:52:25 -0700 Subject: [PATCH 02/10] Do quick checks before calling distinct count --- cpp/src/stream_compaction/distinct_count.cu | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index c667bae80fc..bad3ee4930e 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -189,12 +189,19 @@ cudf::size_type distinct_count(column_view const& input, { if (0 == input.size()) { return 0; } + if (input.null_count() == input.size()) { + if (null_handling == null_policy::INCLUDE) { + return 1; + } else { + return 0; + } + } + auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); - // Check for nulls. + // Check for nulls. If the null policy is EXCLUDE and null values were found, + // we decrement the count. auto const has_null = input.has_nulls(); - // If the null policy is INCLUDE and all the values were null, return count (1). - if (null_handling == null_policy::INCLUDE and has_null and count == 1) { return count; } // If the null policy is EXCLUDE and null values were found, // we decrement the count. if (null_handling == null_policy::EXCLUDE and has_null) { --count; } From 97e556deee69ff8b92e6b6687e7f7e1de45dc7b1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 10 Jun 2024 14:53:47 -0700 Subject: [PATCH 03/10] remove comment --- cpp/src/stream_compaction/distinct_count.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index bad3ee4930e..dd72a793917 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -202,8 +202,6 @@ cudf::size_type distinct_count(column_view const& input, // Check for nulls. If the null policy is EXCLUDE and null values were found, // we decrement the count. auto const has_null = input.has_nulls(); - // If the null policy is EXCLUDE and null values were found, - // we decrement the count. if (null_handling == null_policy::EXCLUDE and has_null) { --count; } // Check for NaNs. There are two cases that can lead to decrementing the From 7bb06f175f1af671fd8f35ad3f1689e04f31b057 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 13 Jun 2024 10:31:19 -0700 Subject: [PATCH 04/10] Update python/cudf/cudf/tests/test_dataframe.py Co-authored-by: GALI PREM SAGAR --- python/cudf/cudf/tests/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 49d1993225c..649821b9b7c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9977,7 +9977,7 @@ def test_dataframe_nunique(data): def test_nunique_preserve_column_in_index(columns): df = cudf.DataFrame([[1, 2]], columns=columns) result = df.nunique().index.to_pandas() - pd.testing.assert_index_equal(result, columns, exact=True) + assert_eq(result, columns, exact=True) @pytest.mark.parametrize( From 56309dea6b31518b8b7f8a57a18d2e9a5e379e74 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 13 Jun 2024 10:49:23 -0700 Subject: [PATCH 05/10] Use None instead of NaN --- python/cudf/cudf/tests/test_multiindex.py | 2 +- python/cudf/cudf/tests/test_series.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 6f20e67995b..2e41a30f870 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2164,7 +2164,7 @@ def test_multi_index_contains_hashable(): ) -@pytest.mark.parametrize("array", [[1, 2], [1, np.nan], [np.nan] * 2]) +@pytest.mark.parametrize("array", [[1, 2], [1, None], [None] * 2]) @pytest.mark.parametrize("dropna", [True, False]) def test_nunique(array, dropna): arrays = [array, [3, 4]] diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index f260e122c0e..52956c230ba 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2855,9 +2855,9 @@ def test_nans_to_nulls_noop_copies_column(value): @pytest.mark.parametrize("dropna", [False, True]) def test_nunique_all_null(dropna): - data = [np.nan, np.nan] + data = [None, None] pd_ser = pd.Series(data) - cudf_ser = cudf.Series(data, nan_as_null=True) + cudf_ser = cudf.Series(data) result = pd_ser.nunique(dropna=dropna) expected = cudf_ser.nunique(dropna=dropna) assert result == expected From 61ad4af3bf1f95c830c7667003f26feb3a5e77ad Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 09:52:08 -0700 Subject: [PATCH 06/10] Update cpp/src/stream_compaction/distinct_count.cu Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/src/stream_compaction/distinct_count.cu | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index dd72a793917..83585f658b1 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -189,12 +189,7 @@ cudf::size_type distinct_count(column_view const& input, { if (0 == input.size()) { return 0; } - if (input.null_count() == input.size()) { - if (null_handling == null_policy::INCLUDE) { - return 1; - } else { - return 0; - } + if (input.null_count() == input.size()) { return static_cast(null_handling == null_policy::INCLUDE); } } auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); From 7c680ef697c6a168170edd40d8e58e8ef4dc0b30 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 09:52:15 -0700 Subject: [PATCH 07/10] Update python/cudf/cudf/tests/test_multiindex.py Co-authored-by: Bradley Dice --- python/cudf/cudf/tests/test_multiindex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 2e41a30f870..7b95e4f9a44 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2164,7 +2164,7 @@ def test_multi_index_contains_hashable(): ) -@pytest.mark.parametrize("array", [[1, 2], [1, None], [None] * 2]) +@pytest.mark.parametrize("array", [[1, 2], [1, None], [None, None]]) @pytest.mark.parametrize("dropna", [True, False]) def test_nunique(array, dropna): arrays = [array, [3, 4]] From 439a67d0b297bd0b114fe2af15c58850ae01f11d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 09:52:23 -0700 Subject: [PATCH 08/10] Update python/cudf/cudf/core/multiindex.py Co-authored-by: Bradley Dice --- python/cudf/cudf/core/multiindex.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f062a49f68b..9c387d94910 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1748,10 +1748,7 @@ def unique(self): @_cudf_nvtx_annotate def nunique(self, dropna: bool = True) -> int: - if dropna: - mi = self.dropna(how="all") - else: - mi = self + mi = self.dropna(how="all") if dropna else self return len(mi.unique()) def _clean_nulls_from_index(self): From 0a39911053ebba8e3eef16271f6a8012fb2635ca Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 10:14:32 -0700 Subject: [PATCH 09/10] Update cpp/src/stream_compaction/distinct_count.cu --- cpp/src/stream_compaction/distinct_count.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 83585f658b1..d6a2d5d3b93 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -190,7 +190,6 @@ cudf::size_type distinct_count(column_view const& input, if (0 == input.size()) { return 0; } if (input.null_count() == input.size()) { return static_cast(null_handling == null_policy::INCLUDE); } - } auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); From b203ea10ac82d94f33ad9042b9570d46277cd51d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 10:19:55 -0700 Subject: [PATCH 10/10] pre-commit --- cpp/src/stream_compaction/distinct_count.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index d6a2d5d3b93..99ca89cc021 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -189,7 +189,9 @@ cudf::size_type distinct_count(column_view const& input, { if (0 == input.size()) { return 0; } - if (input.null_count() == input.size()) { return static_cast(null_handling == null_policy::INCLUDE); } + if (input.null_count() == input.size()) { + return static_cast(null_handling == null_policy::INCLUDE); + } auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream);