From 620511fccb13f3a3ebc43e80265f81433a23474e Mon Sep 17 00:00:00 2001
From: Michael Wang <michaelwang0905@icloud.com>
Date: Tue, 7 Dec 2021 19:08:30 -0800
Subject: [PATCH 1/4] initial

---
 python/cudf/cudf/core/series.py       | 5 ++++-
 python/cudf/cudf/tests/test_series.py | 3 +++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 3aae79af4e8..fc349a1ffb3 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3370,7 +3370,10 @@ def _describe_categorical(self):
             data = [self.count(), self.unique().size]
 
             if data[1] > 0:
-                top, freq = val_counts.index[0], val_counts.iloc[0]
+                # In case there's a tie, break the tie by sorting the index
+                # and take the top.
+                tied_val_counts = val_counts[val_counts == val_counts[0]].sort_index()
+                top, freq = tied_val_counts.index[0], tied_val_counts.iloc[0]
                 data += [str(top), freq]
             # If the DataFrame is empty, set 'top' and 'freq' to None
             # to maintain output shape consistency
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index d59e3ba7571..9647efe0803 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -446,6 +446,9 @@ def test_series_describe_timedelta(dtype):
         pd.Series([True, False, True, True, False]),
         pd.Series([], dtype="str"),
         pd.Series(["a", "b", "c", "a"], dtype="category"),
+        pd.Series(["d", "e", "f"], dtype="category"),
+        pd.Series(["e", "d", "f"], dtype="category"),
+        pd.Series(["f", "e", "d"], dtype="category"),
     ],
 )
 def test_series_describe_other_types(ps):

From ebcf7f86c761556dd229ac9bc46cbf642e63046e Mon Sep 17 00:00:00 2001
From: Michael Wang <michaelwang0905@icloud.com>
Date: Wed, 8 Dec 2021 10:13:22 -0800
Subject: [PATCH 2/4] style

---
 python/cudf/cudf/core/series.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index fc349a1ffb3..ca1b6c03d49 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3372,7 +3372,9 @@ def _describe_categorical(self):
             if data[1] > 0:
                 # In case there's a tie, break the tie by sorting the index
                 # and take the top.
-                tied_val_counts = val_counts[val_counts == val_counts[0]].sort_index()
+                tied_val_counts = val_counts[
+                    val_counts == val_counts[0]
+                ].sort_index()
                 top, freq = tied_val_counts.index[0], tied_val_counts.iloc[0]
                 data += [str(top), freq]
             # If the DataFrame is empty, set 'top' and 'freq' to None

From f23933b6cdbbbf9277bc9d27cbd7f0dfbf1dd43a Mon Sep 17 00:00:00 2001
From: Michael Wang <michaelwang0905@icloud.com>
Date: Thu, 9 Dec 2021 20:55:16 -0800
Subject: [PATCH 3/4] use dict and enable test

---
 python/cudf/cudf/core/series.py       | 141 +++++++++++++-------------
 python/cudf/cudf/tests/test_series.py |  38 +++----
 2 files changed, 88 insertions(+), 91 deletions(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index ca1b6c03d49..cab60a2e2cd 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3312,51 +3312,55 @@ def _format_percentile_names(percentiles):
             return ["{0}%".format(int(x * 100)) for x in percentiles]
 
         def _format_stats_values(stats_data):
-            return list(map(lambda x: round(x, 6), stats_data))
+            return map(lambda x: round(x, 6), stats_data)
 
         def _describe_numeric(self):
             # mimicking pandas
-            index = (
-                ["count", "mean", "std", "min"]
-                + _format_percentile_names(percentiles)
-                + ["max"]
-            )
-            data = (
-                [self.count(), self.mean(), self.std(), self.min()]
-                + self.quantile(percentiles).to_numpy(na_value=np.nan).tolist()
-                + [self.max()]
-            )
-            data = _format_stats_values(data)
+            data = {
+                "count": self.count(),
+                "mean": self.mean(),
+                "std": self.std(),
+                "min": self.min(),
+                **dict(
+                    zip(
+                        _format_percentile_names(percentiles),
+                        self.quantile(percentiles)
+                        .to_numpy(na_value=np.nan)
+                        .tolist(),
+                    )
+                ),
+                "max": self.max(),
+            }
 
             return Series(
-                data=data, index=index, nan_as_null=False, name=self.name,
+                data=_format_stats_values(data.values()),
+                index=data.keys(),
+                nan_as_null=False,
+                name=self.name,
             )
 
         def _describe_timedelta(self):
             # mimicking pandas
-            index = (
-                ["count", "mean", "std", "min"]
-                + _format_percentile_names(percentiles)
-                + ["max"]
-            )
-
-            data = (
-                [
-                    str(self.count()),
-                    str(self.mean()),
-                    str(self.std()),
-                    str(pd.Timedelta(self.min())),
-                ]
-                + self.quantile(percentiles)
-                .astype("str")
-                .to_numpy(na_value=None)
-                .tolist()
-                + [str(pd.Timedelta(self.max()))]
-            )
+            data = {
+                "count": str(self.count()),
+                "mean": str(self.mean()),
+                "std": str(self.std()),
+                "min": pd.Timedelta(self.min()),
+                **dict(
+                    zip(
+                        _format_percentile_names(percentiles),
+                        self.quantile(percentiles)
+                        .astype("str")
+                        .to_numpy(na_value=np.nan)
+                        .tolist(),
+                    )
+                ),
+                "max": str(pd.Timedelta(self.max())),
+            }
 
             return Series(
-                data=data,
-                index=index,
+                data=data.values(),
+                index=data.keys(),
                 dtype="str",
                 nan_as_null=False,
                 name=self.name,
@@ -3365,56 +3369,55 @@ def _describe_timedelta(self):
         def _describe_categorical(self):
             # blocked by StringColumn/DatetimeColumn support for
             # value_counts/unique
-            index = ["count", "unique", "top", "freq"]
-            val_counts = self.value_counts(ascending=False)
-            data = [self.count(), self.unique().size]
-
-            if data[1] > 0:
+            data = {
+                "count": self.count(),
+                "unique": len(self.unique()),
+                "top": None,
+                "freq": None,
+            }
+            if data["count"] > 0:
                 # In case there's a tie, break the tie by sorting the index
                 # and take the top.
+                val_counts = self.value_counts(ascending=False)
                 tied_val_counts = val_counts[
-                    val_counts == val_counts[0]
+                    val_counts == val_counts.iloc[0]
                 ].sort_index()
-                top, freq = tied_val_counts.index[0], tied_val_counts.iloc[0]
-                data += [str(top), freq]
-            # If the DataFrame is empty, set 'top' and 'freq' to None
-            # to maintain output shape consistency
-            else:
-                data += [None, None]
+                data.update(
+                    {
+                        "top": tied_val_counts.index[0],
+                        "freq": tied_val_counts.iloc[0],
+                    }
+                )
 
             return Series(
-                data=data,
+                data=data.values(),
                 dtype="str",
-                index=index,
+                index=data.keys(),
                 nan_as_null=False,
                 name=self.name,
             )
 
         def _describe_timestamp(self):
-
-            index = (
-                ["count", "mean", "min"]
-                + _format_percentile_names(percentiles)
-                + ["max"]
-            )
-
-            data = (
-                [
-                    str(self.count()),
-                    str(self.mean().to_numpy().astype("datetime64[ns]")),
-                    str(pd.Timestamp(self.min().astype("datetime64[ns]"))),
-                ]
-                + self.quantile(percentiles)
-                .astype("str")
-                .to_numpy(na_value=None)
-                .tolist()
-                + [str(pd.Timestamp((self.max()).astype("datetime64[ns]")))]
-            )
+            data = {
+                "count": str(self.count()),
+                "mean": str(pd.Timestamp(self.mean())),
+                "min": str(pd.Timestamp(self.min())),
+                **dict(
+                    zip(
+                        _format_percentile_names(percentiles),
+                        self.quantile(percentiles)
+                        .astype(self.dtype)
+                        .astype("str")
+                        .to_numpy(na_value=np.nan),
+                    )
+                ),
+                "max": str(pd.Timestamp((self.max()))),
+            }
 
             return Series(
-                data=data,
+                data=data.values(),
                 dtype="str",
-                index=index,
+                index=data.keys(),
                 nan_as_null=False,
                 name=self.name,
             )
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 9647efe0803..478116184db 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -12,7 +12,6 @@
 
 import cudf
 from cudf.testing._utils import (
-    DATETIME_TYPES,
     NUMERIC_TYPES,
     TIMEDELTA_TYPES,
     assert_eq,
@@ -402,30 +401,21 @@ def test_series_describe_numeric(dtype):
     assert_eq(expected, actual)
 
 
-@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/6219")
-@pytest.mark.parametrize("dtype", DATETIME_TYPES)
+@pytest.mark.parametrize("dtype", ["datetime64[ns]"])
 def test_series_describe_datetime(dtype):
+    # Note that other datetime units are not tested because pandas does not
+    # support them. When specified coarser units, cuDF datetime columns cannot
+    # represent fractional time for quantiles of the column, which may require
+    # interpolation, this differs from pandas which always stay in [ns] unit.
     gs = cudf.Series([0, 1, 2, 3, 1, 2, 3], dtype=dtype)
     ps = gs.to_pandas()
 
-    pdf_results = ps.describe(datetime_is_numeric=True)
-    gdf_results = gs.describe()
-
-    # Assert count
-    p_count = pdf_results["count"]
-    g_count = gdf_results["count"]
-
-    assert_eq(int(g_count), p_count)
-
-    # Assert Index
-    assert_eq(gdf_results.index, pdf_results.index)
+    # Treating datetimes as categoricals is deprecated in pandas and will
+    # be removed in future. Future behavior is treating datetime as numeric.
+    expected = ps.describe(datetime_is_numeric=True)
+    actual = gs.describe()
 
-    # Assert rest of the element apart from
-    # the first index('count')
-    actual = gdf_results.tail(-1).astype("datetime64[ns]")
-    expected = pdf_results.tail(-1).astype("str").astype("datetime64[ns]")
-
-    assert_eq(expected, actual)
+    assert_eq(expected.astype("str"), actual)
 
 
 @pytest.mark.parametrize("dtype", TIMEDELTA_TYPES)
@@ -447,8 +437,12 @@ def test_series_describe_timedelta(dtype):
         pd.Series([], dtype="str"),
         pd.Series(["a", "b", "c", "a"], dtype="category"),
         pd.Series(["d", "e", "f"], dtype="category"),
-        pd.Series(["e", "d", "f"], dtype="category"),
-        pd.Series(["f", "e", "d"], dtype="category"),
+        pd.Series(pd.Categorical(["d", "e", "f"], categories=["f", "e", "d"])),
+        pd.Series(
+            pd.Categorical(
+                ["d", "e", "f"], categories=["f", "e", "d"], ordered=True
+            )
+        ),
     ],
 )
 def test_series_describe_other_types(ps):

From 1136a80afdfe9d268739dcfb763926526a73bd0e Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Mon, 13 Dec 2021 15:14:03 -0800
Subject: [PATCH 4/4] Update python/cudf/cudf/core/series.py

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/series.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index cab60a2e2cd..51be60833e6 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3345,7 +3345,7 @@ def _describe_timedelta(self):
                 "count": str(self.count()),
                 "mean": str(self.mean()),
                 "std": str(self.std()),
-                "min": pd.Timedelta(self.min()),
+                "min": str(pd.Timedelta(self.min())),
                 **dict(
                     zip(
                         _format_percentile_names(percentiles),