Implement value_counts for DataFrame (#10813)

Add `value_counts()` functionality to DataFrame. Resolves #5169. Authors: https://github.com/martinfalisse. Approvers: GALI PREM SAGAR (https://github.com/galipremsagar). URL: #10813
rapidsai · Jun 1, 2022 · 58cc6a1 · 58cc6a1
1 parent 7f359e0
commit 58cc6a1
Show file tree

Hide file tree

Showing 3 changed files with 136 additions and 2 deletions.
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -6490,6 +6490,76 @@ def eval(self, expr: str, inplace: bool = False, **kwargs):
         if not inplace:
             return ret
 
+    def value_counts(
+        self,
+        subset=None,
+        normalize=False,
+        sort=True,
+        ascending=False,
+        dropna=True,
+    ):
+        """
+        Return a Series containing counts of unique rows in the DataFrame.
+
+        Parameters
+        ----------
+        subset: list-like, optional
+            Columns to use when counting unique combinations. When omitted,
+            every column of the DataFrame is used.
+        normalize: bool, default False
+            Return proportions rather than frequencies.
+        sort: bool, default True
+            Sort by frequencies.
+        ascending: bool, default False
+            Sort in ascending order.
+        dropna: bool, default True
+            Don't include counts of rows that contain NA values.
+
+        Returns
+        -------
+        Series
+
+        Raises
+        ------
+        KeyError
+            If ``subset`` names a column that does not exist in the
+            DataFrame.
+
+        Notes
+        -----
+        The returned Series will have a MultiIndex with one level per input
+        column. By default, rows that contain any NA values are omitted from
+        the result. By default, the resulting Series will be in descending
+        order so that the first element is the most frequently-occurring row.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'num_legs': [2, 4, 4, 6],
+        ...                    'num_wings': [2, 0, 0, 0]},
+        ...                    index=['falcon', 'dog', 'cat', 'ant'])
+        >>> df.value_counts()
+        num_legs  num_wings
+        4         0            2
+        2         2            1
+        6         0            1
+        dtype: int64
+        """
+        if subset is None:
+            columns = list(self._data.names)
+        else:
+            # Compare against None instead of truth-testing ``subset``:
+            # array-like inputs have no unambiguous truth value. Materialize
+            # it once so a one-shot iterable is not exhausted by the
+            # validation below before it reaches ``groupby``.
+            columns = list(subset)
+            diff = set(columns) - set(self._data)
+            if len(diff) != 0:
+                raise KeyError(f"columns {diff} do not exist")
+        result = (
+            self.groupby(
+                by=columns,
+                dropna=dropna,
+            )
+            .size()
+            .astype("int64")
+        )
+        if sort:
+            result = result.sort_values(ascending=ascending)
+        if normalize:
+            # Divide by the total count so the values become proportions.
+            result = result / result._column.sum()
+        # Pandas always returns MultiIndex even if only one column.
+        if not isinstance(result.index, MultiIndex):
+            result.index = MultiIndex._from_data(result._index._data)
+        return result
+
 
 def from_dataframe(df, allow_copy=False):
     return df_protocol.from_dataframe(df, allow_copy=allow_copy)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -188,7 +188,7 @@ def size(self):
                     len(self.obj), "int8", masked=False
                 )
             )
-            .groupby(self.grouping, sort=self._sort)
+            .groupby(self.grouping, sort=self._sort, dropna=self._dropna)
             .agg("size")
         )
 

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -4269,7 +4269,7 @@ def test_df_values_property(data):
     np.testing.assert_array_equal(pmtr, gmtr)
 
 
-def test_value_counts():
+def test_numeric_alpha_value_counts():
     pdf = pd.DataFrame(
         {
             "numeric": [1, 2, 3, 4, 5, 6, 1, 2, 4] * 10,
@@ -9356,3 +9356,67 @@ def test_dataframe_eval(df_eval, expr, dtype):
 def test_dataframe_eval_errors(df_eval, expr):
     with pytest.raises(ValueError):
         df_eval.eval(expr)
+
+
+@pytest.mark.parametrize(
+    "gdf,subset",
+    [
+        (
+            cudf.DataFrame(
+                {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+                index=["falcon", "dog", "cat", "ant"],
+            ),
+            ["num_legs"],
+        ),
+        (
+            cudf.DataFrame(
+                {
+                    "first_name": ["John", "Anne", "John", "Beth"],
+                    "middle_name": ["Smith", None, None, "Louise"],
+                }
+            ),
+            ["first_name"],
+        ),
+    ],
+)
+@pytest.mark.parametrize("sort", [True, False])
+@pytest.mark.parametrize("ascending", [True, False])
+@pytest.mark.parametrize("normalize", [True, False])
+@pytest.mark.parametrize("dropna", [True, False])
+@pytest.mark.parametrize("use_subset", [True, False])
+def test_value_counts(
+    gdf,
+    subset,
+    sort,
+    ascending,
+    normalize,
+    dropna,
+    use_subset,
+):
+    """Check ``DataFrame.value_counts`` on cuDF against the pandas result.
+
+    The same keyword arguments are passed to both libraries and the results
+    are compared after sorting by index. An unknown column name in
+    ``subset`` must raise ``KeyError``.
+    """
+    options = {
+        "subset": subset if use_subset else None,
+        "sort": sort,
+        "ascending": ascending,
+        "normalize": normalize,
+        "dropna": dropna,
+    }
+
+    pdf = gdf.to_pandas()
+    got = gdf.value_counts(**options)
+    expected = pdf.value_counts(**options)
+
+    if not dropna:
+        # With NA rows kept, the two libraries represent missing values
+        # differently (<NA> in cuDF, NaN in Pandas), so compare against a
+        # cuDF copy of the Pandas result.
+        expected = cudf.from_pandas(expected)
+    assert_eq(got.sort_index(), expected.sort_index())
+
+    with pytest.raises(KeyError):
+        gdf.value_counts(subset=["not_a_column_name"])