Implement .describe() for DataFrameGroupBy (#8179)

This PR implements summary statistics for the result of a `DataFrame.groupby()` operation via the `.describe()` method, similar to pandas.

```
>>> import pandas as pd
>>> pdf = pd.DataFrame({"Speed": [380.0, 370.0, 24.0, 26.0], "Score": [50, 30, 90, 80]})
>>> pdf
   Speed  Score
0  380.0     50
1  370.0     30
2   24.0     90
3   26.0     80
>>> pdf.groupby('Score').describe()
       Speed
       count   mean  std    min    25%    50%    75%    max
Score
30       1.0  370.0  NaN  370.0  370.0  370.0  370.0  370.0
50       1.0  380.0  NaN  380.0  380.0  380.0  380.0  380.0
80       1.0   26.0  NaN   26.0   26.0   26.0   26.0   26.0
90       1.0   24.0  NaN   24.0   24.0   24.0   24.0   24.0
>>> import cudf
>>> gdf = cudf.from_pandas(pdf)
>>> gdf.groupby('Score').describe()
       count   mean   std    min    25%    50%    75%    max
Score
30         1  370.0  <NA>  370.0  370.0  370.0  370.0  370.0
50         1  380.0  <NA>  380.0  380.0  380.0  380.0  380.0
80         1   26.0  <NA>   26.0   26.0   26.0   26.0   26.0
90         1   24.0  <NA>   24.0   24.0   24.0   24.0   24.0
```

Fixes: #7990

Authors:
- Sheilah Kirui (https://github.com/skirui-source)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)
- Ashwin Srinath (https://github.com/shwina)
- Michael Wang (https://github.com/isVoid)
- Christopher Harris (https://github.com/cwharris)

URL: #8179
rapidsai · Jun 7, 2021 · 92ed5b3 · 92ed5b3
1 parent ff1e849
commit 92ed5b3
Show file tree

Hide file tree

Showing 2 changed files with 106 additions and 0 deletions.
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -14,6 +14,21 @@
 from cudf.utils.utils import GetAttrGetItemMixin, cached_property
 
 
+# The three functions below return the 25%, 50% and 75% quantiles,
+# respectively; they are called in the describe() method to output
+# the summary stats of a GroupBy object.
+def _quantile_25(x):
+    """Return ``x.quantile(0.25)``, i.e. the 25% quantile of ``x``."""
+    return x.quantile(0.25)
+
+
+def _quantile_50(x):
+    """Return ``x.quantile(0.50)``, i.e. the 50% quantile of ``x``."""
+    return x.quantile(0.50)
+
+
+def _quantile_75(x):
+    """Return ``x.quantile(0.75)``, i.e. the 75% quantile of ``x``."""
+    return x.quantile(0.75)
+
+
 # Note that all valid aggregation methods (e.g. GroupBy.min) are bound to the
 # class after its definition (see below).
 class GroupBy(Serializable):
@@ -601,6 +616,75 @@ def func(x):
 
         return self.agg(func)
 
+    def describe(self, include=None, exclude=None):
+        """
+        Generate descriptive statistics that summarize the central tendency,
+        dispersion and shape of a dataset's distribution, excluding NaN values.
+
+        Analyzes numeric DataFrames only.
+
+        The statistics are ``count``, ``mean``, ``std``, ``min``, the 25%,
+        50% and 75% quantiles, and ``max``, all computed through a single
+        ``self.agg`` call.
+
+        Parameters
+        ----------
+        include : 'all', list-like of dtypes or None (default), optional
+            List of data types to include in the result.
+            Currently not applied by this method; see Raises.
+
+        exclude : list-like of dtypes or None (default), optional
+            List of data types to omit from the result.
+            Currently not applied by this method; see Raises.
+
+        Returns
+        -------
+        Series or DataFrame
+            Summary statistics of the DataFrame provided, with the quantile
+            columns labeled ``"25%"``, ``"50%"`` and ``"75%"``.
+
+        Raises
+        ------
+        NotImplementedError
+            If both ``include`` and ``exclude`` are given (not None).
+
+        Examples
+        --------
+        >>> import cudf
+        >>> gdf = cudf.DataFrame({"Speed": [380.0, 370.0, 24.0, 26.0],
+        ...                       "Score": [50, 30, 90, 80]})
+        >>> gdf
+           Speed  Score
+        0  380.0     50
+        1  370.0     30
+        2   24.0     90
+        3   26.0     80
+        >>> gdf.groupby('Score').describe()
+              Speed
+              count   mean   std    min    25%    50%    75%    max
+        Score
+        30        1  370.0  <NA>  370.0  370.0  370.0  370.0  370.0
+        50        1  380.0  <NA>  380.0  380.0  380.0  380.0  380.0
+        80        1   26.0  <NA>   26.0   26.0   26.0   26.0   26.0
+        90        1   24.0  <NA>   24.0   24.0   24.0   24.0   24.0
+
+        """
+        # Only the combination of both filters is rejected; a single filter
+        # passes this check and is not referenced again below.
+        if exclude is not None and include is not None:
+            raise NotImplementedError
+
+        # The list order fixes the order of the statistics in the result.
+        res = self.agg(
+            [
+                "count",
+                "mean",
+                "std",
+                "min",
+                _quantile_25,
+                _quantile_50,
+                _quantile_75,
+                "max",
+            ]
+        )
+        # Relabel the quantile columns on the second column level. The keys
+        # match the names of the helper functions passed to agg above
+        # (presumably agg labels callable results by function name --
+        # TODO confirm).
+        res.rename(
+            columns={
+                "_quantile_25": "25%",
+                "_quantile_50": "50%",
+                "_quantile_75": "75%",
+            },
+            level=1,
+            inplace=True,
+        )
+        return res
+
     def sum(self):
         """Compute the column-wise sum of the values in each group."""
         return self.agg("sum")

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
@@ -1901,3 +1901,25 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value):
     assert_groupby_results_equal(
         expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
     )
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        # Two columns: one is the group key, one is described.
+        {"Speed": [380.0, 370.0, 24.0, 26.0], "Score": [50, 30, 90, 80]},
+        # Three columns: more than one column remains besides the group key.
+        {
+            "Speed": [380.0, 370.0, 24.0, 26.0],
+            "Score": [50, 30, 90, 80],
+            "Other": [10, 20, 30, 40],
+        },
+    ],
+)
+@pytest.mark.parametrize("group", ["Score", "Speed"])
+def test_groupby_describe(data, group):
+    """Check that cudf ``groupby(group).describe()`` matches pandas.
+
+    The same data is loaded into a pandas DataFrame and its cudf copy, both
+    are grouped by ``group``, and the ``describe()`` results are compared.
+    """
+    pdf = pd.DataFrame(data)
+    gdf = cudf.from_pandas(pdf)
+
+    got = gdf.groupby(group).describe()
+    expect = pdf.groupby(group).describe()
+
+    # Dtypes are deliberately not compared (check_dtype=False).
+    assert_groupby_results_equal(expect, got, check_dtype=False)