diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 323a5ad088a..3735a949277 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6027,6 +6027,37 @@ def __dataframe__(
             self, nan_as_null=nan_as_null, allow_copy=allow_copy
         )
 
+    def nunique(self, axis=0, dropna=True):
+        """
+        Count number of distinct elements in specified axis.
+        Return Series with number of distinct elements. Can ignore NaN values.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to use. Only 0 or 'index' (one count per column) is
+            currently supported; other values raise NotImplementedError.
+        dropna : bool, default True
+            Don't include NaN in the counts.
+
+        Returns
+        -------
+        Series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
+        >>> df.nunique()
+        A    3
+        B    2
+        dtype: int64
+        """
+        if axis not in (0, "index"):
+            raise NotImplementedError("axis parameter is not supported yet.")
+
+        return cudf.Series(super().nunique(method="sort", dropna=dropna))
+
 
 def from_dataframe(df, allow_copy=False):
     return df_protocol.from_dataframe(df, allow_copy=allow_copy)
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 891f58657b0..7e97d655147 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import builtins
 import copy
 import pickle
 import warnings
@@ -6402,6 +6403,28 @@ def ge(self, other, axis="columns", level=None, fill_value=None):
             other=other, fn="ge", fill_value=fill_value, can_reindex=True
         )
 
+    def nunique(self, method: builtins.str = "sort", dropna: bool = True):
+        """
+        Returns a per column mapping with counts of unique values for
+        each column.
+
+        Parameters
+        ----------
+        method : builtins.str, default "sort"
+            Method used by cpp_distinct_count
+        dropna : bool, default True
+            Don't include NaN in the counts.
+
+        Returns
+        -------
+        dict
+            Name and unique value counts of each column in frame.
+        """
+        return {
+            name: col.distinct_count(method=method, dropna=dropna)
+            for name, col in self._data.items()
+        }
+
 
 def _get_replacement_values_for_columns(
     to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any]
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 66194f0f877..12a2538b776 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -2756,7 +2756,7 @@ def nunique(self, method="sort", dropna=True):
             raise NotImplementedError(msg)
         if self.null_count == len(self):
             return 0
-        return self._column.distinct_count(method, dropna)
+        return super().nunique(method, dropna)
 
     def value_counts(
         self,
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 2623569afac..ef479f19363 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import builtins
 from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union
 
 import cupy
@@ -325,3 +326,21 @@ def _make_operands_for_binop(
                 return NotImplemented
 
         return {result_name: (self._column, other, reflect, fill_value)}
+
+    def nunique(self, method: builtins.str = "sort", dropna: bool = True):
+        """
+        Return count of unique values for the column.
+
+        Parameters
+        ----------
+        method : builtins.str, default "sort"
+            Method used by cpp_distinct_count
+        dropna : bool, default True
+            Don't include NaN in the counts.
+
+        Returns
+        -------
+        int
+            Number of unique values in the column.
+        """
+        return self._column.distinct_count(method=method, dropna=dropna)
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 889662c8a1c..ba2caf7c6c8 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9081,6 +9081,32 @@ def test_dataframe_assign_cp_np_array():
     assert_eq(pdf, gdf)
 
 
+@pytest.mark.parametrize(
+    "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}],
+)
+def test_dataframe_nunique(data):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    actual = gdf.nunique()
+    expected = pdf.nunique()
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "data", [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}],
+)
+def test_dataframe_nunique_index(data):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    actual = gdf.index.nunique()
+    expected = pdf.index.nunique()
+
+    assert_eq(expected, actual)
+
+
 def test_dataframe_rename_duplicate_column():
     gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
     with pytest.raises(
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 1c80fe80f2d..358484d79b9 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -1521,6 +1521,32 @@ def test_series_transpose(data):
     assert_eq(cudf_transposed, csr)
 
 
+@pytest.mark.parametrize(
+    "data", [[1, 3, 5, 7, 7]],
+)
+def test_series_nunique(data):
+    cd_s = cudf.Series(data)
+    pd_s = cd_s.to_pandas()
+
+    actual = cd_s.nunique()
+    expected = pd_s.nunique()
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "data", [[1, 3, 5, 7, 7]],
+)
+def test_series_nunique_index(data):
+    cd_s = cudf.Series(data)
+    pd_s = cd_s.to_pandas()
+
+    actual = cd_s.index.nunique()
+    expected = pd_s.index.nunique()
+
+    assert_eq(expected, actual)
+
+
 @pytest.mark.parametrize(
     "fill_value,data",
     [