From 7291d3af443e52dc4b280e7462850535eb7bf79b Mon Sep 17 00:00:00 2001 From: martinfalisse Date: Thu, 27 Jan 2022 12:29:14 +0100 Subject: [PATCH 1/3] Add Dataframe and Index nunique --- python/cudf/cudf/core/dataframe.py | 31 ++++++++++++++++++++ python/cudf/cudf/core/frame.py | 21 +++++++++++++ python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/core/single_column_frame.py | 18 ++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 24 +++++++++++++++ python/cudf/cudf/tests/test_series.py | 24 +++++++++++++++ 6 files changed, 119 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c686cd0fd39..f9fef7dc4dc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6145,6 +6145,37 @@ def __dataframe__( self, nan_as_null=nan_as_null, allow_copy=allow_copy ) + def nunique(self, axis=0, dropna=True): + """ + Count number of distinct elements in specified axis. + Return Series with number of distinct elements. Can ignore NaN values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) + >>> df.nunique() + A 3 + B 2 + dtype: int64 + """ + if axis != 0: + raise NotImplementedError( + "axis parameter is not supported yet." + ) + + return cudf.Series(super().nunique(method="sort", dropna=dropna)) def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 1d59d9f3b1a..6142f0f0f40 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6420,6 +6420,27 @@ def ge(self, other, axis="columns", level=None, fill_value=None): other=other, fn="ge", fill_value=fill_value, can_reindex=True ) + def nunique(self, method: builtins.str = "sort", dropna: bool = True): + """ + Returns a per column mapping with counts of unique values for + each column. + + Parameters + ---------- + method : builtins.str, default "sort" + Method used by cpp_distinct_count + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + dict + Name and unique value counts of each column in frame. + """ + return { + name: col.distinct_count(method=method, dropna=dropna) + for name, col in self._data.items() + } def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e96531d4b1c..0032dc25cee 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2947,7 +2947,7 @@ def nunique(self, method="sort", dropna=True): raise NotImplementedError(msg) if self.null_count == len(self): return 0 - return self._column.distinct_count(method, dropna) + return super().nunique(method, dropna) def value_counts( self, diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 7793a2fdf29..85a85b1930f 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -336,3 +336,21 @@ def _make_operands_for_binop( return NotImplemented return {result_name: (self._column, other, reflect, fill_value)} + + def nunique(self, method: builtins.str = "sort", dropna: bool = True): + """ + Returns count of unique values for the column. + + Parameters + ---------- + method : builtins.str, default "sort" + Method used by cpp_distinct_count + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + int + Number of unique values in the column. + """ + return sum(super().nunique(method=method, dropna=dropna).values()) \ No newline at end of file diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 3e359335719..d81451e9d54 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9077,3 +9077,27 @@ def test_dataframe_assign_cp_np_array(): gdf[[f"f_{i}" for i in range(n)]] = cp_ndarray assert_eq(pdf, gdf) + +@pytest.mark.parametrize( + "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}], +) +def test_dataframe_nunique(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.nunique() + expected = pdf.nunique() + + assert_eq(expected, actual) + +@pytest.mark.parametrize( + "data", [{ "key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}], +) +def test_dataframe_nunique_index(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.index.nunique() + expected = pdf.index.nunique() + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index ffdd53c58ac..ee891828b3c 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1517,3 +1517,27 @@ def test_series_transpose(data): assert_eq(pd_transposed, cudf_transposed) assert_eq(pd_property, cudf_property) assert_eq(cudf_transposed, csr) + +@pytest.mark.parametrize( + "data", [1, 3, 5, 7, 7], +) +def test_series_nunique(data): + cd_s = cudf.Series(data) + pd_s = cd_s.to_pandas() + + actual = cd_s.nunique() + expected = pd_s.nunique() + + assert_eq(expected, actual) + +@pytest.mark.parametrize( + "data", [1, 3, 5, 7, 7], +) +def test_series_nunique_index(data): + cd_s = cudf.Series(data) + pd_s = cd_s.to_pandas() + + actual = cd_s.index.nunique() + expected = pd_s.index.nunique() + + assert_eq(expected, actual) \ No newline at end of file From 90c2c0ea1c4a642d19da9afb2754462f3fb21ac1 Mon Sep 17 00:00:00 2001 From: martinfalisse <45781926+martinfalisse@users.noreply.github.com> Date: Mon, 31 Jan 2022 12:55:34 +0100 Subject: [PATCH 2/3] Use distinct_count instead of super().nunique in single_column_frame Co-authored-by: Ashwin Srinath <3190405+shwina@users.noreply.github.com> --- python/cudf/cudf/core/single_column_frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 85a85b1930f..e480e31bc4b 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -353,4 +353,4 @@ def nunique(self, method: builtins.str = "sort", dropna: bool = True): int Number of unique values in the column. """ - return sum(super().nunique(method=method, dropna=dropna).values()) \ No newline at end of file + return self._column.distinct_count(method=method, dropna=dropna) \ No newline at end of file From bcd84ad76fd0f2676d3ced0db58dc31c018addcb Mon Sep 17 00:00:00 2001 From: martinfalisse Date: Wed, 2 Feb 2022 22:43:58 +0100 Subject: [PATCH 3/3] Remove unnecessary nunique function in series. --- python/cudf/cudf/core/dataframe.py | 10 +++++----- python/cudf/cudf/core/frame.py | 6 ++++-- python/cudf/cudf/core/single_column_frame.py | 5 +++-- python/cudf/cudf/tests/test_dataframe.py | 3 +-- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bb9cd9b5cc7..3735a949277 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6031,11 +6031,12 @@ def nunique(self, axis=0, dropna=True): """ Count number of distinct elements in specified axis. Return Series with number of distinct elements. Can ignore NaN values. - + Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for + column-wise. dropna : bool, default True Don't include NaN in the counts. @@ -6053,12 +6054,11 @@ def nunique(self, axis=0, dropna=True): dtype: int64 """ if axis != 0: - raise NotImplementedError( - "axis parameter is not supported yet." - ) + raise NotImplementedError("axis parameter is not supported yet.") return cudf.Series(super().nunique(method="sort", dropna=dropna)) + def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index a05986555b1..7e97d655147 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2,6 +2,7 @@ from __future__ import annotations +import builtins import copy import pickle import warnings @@ -6420,10 +6421,11 @@ def nunique(self, method: builtins.str = "sort", dropna: bool = True): Name and unique value counts of each column in frame. """ return { - name: col.distinct_count(method=method, dropna=dropna) - for name, col in self._data.items() + name: col.distinct_count(method=method, dropna=dropna) + for name, col in self._data.items() } + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] ) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]: diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index ea1917acc10..ef479f19363 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,6 +3,7 @@ from __future__ import annotations +import builtins from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union import cupy @@ -328,7 +329,7 @@ def _make_operands_for_binop( def nunique(self, method: builtins.str = "sort", dropna: bool = True): """ - Returns count of unique values for the column. + Return count of unique values for the column. Parameters ---------- @@ -342,4 +343,4 @@ def nunique(self, method: builtins.str = "sort", dropna: bool = True): int Number of unique values in the column. """ - return self._column.distinct_count(method=method, dropna=dropna) \ No newline at end of file + return self._column.distinct_count(method=method, dropna=dropna) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d0a25fd3e8c..ba2caf7c6c8 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9095,7 +9095,7 @@ def test_dataframe_nunique(data): @pytest.mark.parametrize( - "data", [{ "key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}], + "data", [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}], ) def test_dataframe_nunique_index(data): gdf = cudf.DataFrame(data) @@ -9113,4 +9113,3 @@ def test_dataframe_rename_duplicate_column(): ValueError, match="Duplicate column names are not allowed" ): gdf.rename(columns={"a": "b"}, inplace=True) -