Skip to content

Commit

Permalink
Implement value_counts for DataFrame (#10813)
Browse files Browse the repository at this point in the history
Add functionality for value_counts() in DataFrame. Resolves #5169

Authors:
  - https://github.com/martinfalisse

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #10813
  • Loading branch information
martinfalisse authored Jun 1, 2022
1 parent 7f359e0 commit 58cc6a1
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 2 deletions.
70 changes: 70 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6490,6 +6490,76 @@ def eval(self, expr: str, inplace: bool = False, **kwargs):
if not inplace:
return ret

def value_counts(
    self,
    subset=None,
    normalize=False,
    sort=True,
    ascending=False,
    dropna=True,
):
    """
    Return a Series containing counts of unique rows in the DataFrame.

    Parameters
    ----------
    subset: list-like, optional
        Columns to use when counting unique combinations.
    normalize: bool, default False
        Return proportions rather than frequencies.
    sort: bool, default True
        Sort by frequencies.
    ascending: bool, default False
        Sort in ascending order.
    dropna: bool, default True
        Don't include counts of rows that contain NA values.

    Returns
    -------
    Series

    Raises
    ------
    KeyError
        If ``subset`` contains a name that is not a column of the
        DataFrame.

    Notes
    -----
    The returned Series will have a MultiIndex with one level per input
    column. By default, rows that contain any NA values are omitted from
    the result. By default, the resulting Series will be in descending
    order so that the first element is the most frequently-occurring row.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({'num_legs': [2, 4, 4, 6],
    ...                      'num_wings': [2, 0, 0, 0]},
    ...                     index=['falcon', 'dog', 'cat', 'ant'])
    >>> df.value_counts()
    num_legs  num_wings
    4         0            2
    2         2            1
    6         0            1
    dtype: int64
    """
    if subset is None:
        columns = list(self._data.names)
    else:
        # Test ``is None`` rather than truthiness: array-like subsets
        # (e.g. a NumPy array) have no unambiguous truth value, and the
        # validation must cover exactly the cases where ``subset`` is
        # used to pick the grouping columns.
        diff = set(subset) - set(self._data)
        if len(diff) != 0:
            raise KeyError(f"columns {diff} do not exist")
        # Normalize to a list so that every entry is handed to groupby
        # as an individual key, matching how the validation above
        # interprets ``subset``.
        columns = list(subset)
    result = (
        self.groupby(
            by=columns,
            dropna=dropna,
        )
        .size()
        .astype("int64")
    )
    if sort:
        result = result.sort_values(ascending=ascending)
    if normalize:
        # Divide each count by the total to turn counts into proportions.
        result = result / result._column.sum()
    # Pandas always returns MultiIndex even if only one column.
    if not isinstance(result.index, MultiIndex):
        result.index = MultiIndex._from_data(result._index._data)
    return result


def from_dataframe(df, allow_copy=False):
    """
    Convert ``df`` by delegating to ``df_protocol.from_dataframe``.

    Parameters
    ----------
    df
        Object to convert; forwarded unchanged to the protocol helper.
        NOTE(review): presumably any object supporting the dataframe
        interchange protocol -- confirm against ``df_protocol``.
    allow_copy: bool, default False
        Forwarded as the ``allow_copy`` keyword of the protocol helper.

    Returns
    -------
    Whatever ``df_protocol.from_dataframe`` returns.
    """
    converted = df_protocol.from_dataframe(df, allow_copy=allow_copy)
    return converted
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def size(self):
len(self.obj), "int8", masked=False
)
)
.groupby(self.grouping, sort=self._sort)
.groupby(self.grouping, sort=self._sort, dropna=self._dropna)
.agg("size")
)

Expand Down
66 changes: 65 additions & 1 deletion python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4269,7 +4269,7 @@ def test_df_values_property(data):
np.testing.assert_array_equal(pmtr, gmtr)


def test_value_counts():
def test_numeric_alpha_value_counts():
pdf = pd.DataFrame(
{
"numeric": [1, 2, 3, 4, 5, 6, 1, 2, 4] * 10,
Expand Down Expand Up @@ -9356,3 +9356,67 @@ def test_dataframe_eval(df_eval, expr, dtype):
def test_dataframe_eval_errors(df_eval, expr):
    """Check that evaluating ``expr`` on ``df_eval`` raises ValueError."""
    expected_error = ValueError
    with pytest.raises(expected_error):
        df_eval.eval(expr)


@pytest.mark.parametrize(
    "gdf,subset",
    [
        (
            cudf.DataFrame(
                {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
                index=["falcon", "dog", "cat", "ant"],
            ),
            ["num_legs"],
        ),
        (
            cudf.DataFrame(
                {
                    "first_name": ["John", "Anne", "John", "Beth"],
                    "middle_name": ["Smith", None, None, "Louise"],
                }
            ),
            ["first_name"],
        ),
    ],
)
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize("dropna", [True, False])
@pytest.mark.parametrize("use_subset", [True, False])
def test_value_counts(
    gdf,
    subset,
    sort,
    ascending,
    normalize,
    dropna,
    use_subset,
):
    """
    Compare ``DataFrame.value_counts`` between cuDF and pandas for every
    combination of the parametrized options, then check that an unknown
    column name in ``subset`` raises ``KeyError``.
    """
    pdf = gdf.to_pandas()

    # Both libraries are called with exactly the same keyword arguments.
    options = dict(
        subset=subset if use_subset else None,
        sort=sort,
        ascending=ascending,
        normalize=normalize,
        dropna=dropna,
    )
    got = gdf.value_counts(**options)
    expected = pdf.value_counts(**options)

    if dropna:
        assert_eq(got.sort_index(), expected.sort_index())
    else:
        # With dropna=False the two libraries represent missing keys
        # differently (<NA> in cuDF, NaN in pandas), so the pandas result
        # is converted to a cuDF Series before comparing.
        assert_eq(got.sort_index(), cudf.from_pandas(expected).sort_index())

    with pytest.raises(KeyError):
        gdf.value_counts(subset=["not_a_column_name"])

0 comments on commit 58cc6a1

Please sign in to comment.