rapidsai · rapids-bot · Jun 7, 2021 · May 7, 2021 · May 11, 2021 · May 11, 2021
@@ -14,6 +14,21 @@
 from cudf.utils.utils import GetAttrGetItemMixin, cached_property
 
 
+# The three functions below return the quantiles [25%, 50%, 75%]
+# respectively, which are called in the describe() method to ouput
+# the summary stats of a GroupBy object
+def _quantile_25(x):
+    return x.quantile(0.25)
+
+
+def _quantile_50(x):
+    return x.quantile(0.50)
+
+
+def _quantile_75(x):
+    return x.quantile(0.75)
+
+
 # Note that all valid aggregation methods (e.g. GroupBy.min) are bound to the
 # class after its definition (see below).
 class GroupBy(Serializable):
@@ -601,6 +616,75 @@ def func(x):
 
         return self.agg(func)
 
+    def describe(self, include=None, exclude=None):
+        """
+        Generate descriptive statistics that summarizes the central tendency,
+        dispersion and shape of a dataset’s distribution, excluding NaN values.
+
+        Analyzes numeric DataFrames only
+
+        Parameters
+        ----------
+        include: ‘all’, list-like of dtypes or None (default), optional
+            list of data types to include in the result.
+            Ignored for Series.
+
+        exclude: list-like of dtypes or None (default), optional,
+            list of data types to omit from the result.
+            Ignored for Series.
+
+        Returns
+        -------
+        Series or DataFrame
+            Summary statistics of the Dataframe provided.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> gdf = cudf.DataFrame({"Speed": [380.0, 370.0, 24.0, 26.0],
+                                  "Score": [50, 30, 90, 80]})
+        >>> gdf
+        Speed  Score
+        0  380.0     50
+        1  370.0     30
+        2   24.0     90
+        3   26.0     80
+        >>> gdf.groupby('Score').describe()
+            Speed
+            count   mean   std    min    25%    50%    75%     max
+        Score
+        30        1  370.0  <NA>  370.0  370.0  370.0  370.0  370.0
+        50        1  380.0  <NA>  380.0  380.0  380.0  380.0  380.0
+        80        1   26.0  <NA>   26.0   26.0   26.0   26.0   26.0
+        90        1   24.0  <NA>   24.0   24.0   24.0   24.0   24.0
+
+        """
+        if exclude is not None and include is not None:
+            raise NotImplementedError
+
+        res = self.agg(
+            [
+                "count",
+                "mean",
+                "std",
+                "min",
+                _quantile_25,
+                _quantile_50,
+                _quantile_75,
+                "max",
+            ]
+        )
+        res.rename(
+            columns={
+                "_quantile_25": "25%",
+                "_quantile_50": "50%",
+                "_quantile_75": "75%",
+            },
+            level=1,
+            inplace=True,
+        )
+        return res
+
     def sum(self):
         """Compute the column-wise sum of the values in each group."""
         return self.agg("sum")

@@ -1901,3 +1901,25 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value):
     assert_groupby_results_equal(
         expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
     )
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {"Speed": [380.0, 370.0, 24.0, 26.0], "Score": [50, 30, 90, 80]},
+        {
+            "Speed": [380.0, 370.0, 24.0, 26.0],
+            "Score": [50, 30, 90, 80],
+            "Other": [10, 20, 30, 40],
+        },
+    ],
+)
+@pytest.mark.parametrize("group", ["Score", "Speed"])
+def test_groupby_describe(data, group):
+    pdf = pd.DataFrame(data)
+    gdf = cudf.from_pandas(pdf)
+
+    got = gdf.groupby(group).describe()
+    expect = pdf.groupby(group).describe()
+
+    assert_groupby_results_equal(expect, got, check_dtype=False)