From 620511fccb13f3a3ebc43e80265f81433a23474e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 7 Dec 2021 19:08:30 -0800 Subject: [PATCH 1/4] initial --- python/cudf/cudf/core/series.py | 5 ++++- python/cudf/cudf/tests/test_series.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3aae79af4e8..fc349a1ffb3 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3370,7 +3370,10 @@ def _describe_categorical(self): data = [self.count(), self.unique().size] if data[1] > 0: - top, freq = val_counts.index[0], val_counts.iloc[0] + # In case there's a tie, break the tie by sorting the index + # and take the top. + tied_val_counts = val_counts[val_counts == val_counts[0]].sort_index() + top, freq = tied_val_counts.index[0], tied_val_counts.iloc[0] data += [str(top), freq] # If the DataFrame is empty, set 'top' and 'freq' to None # to maintain output shape consistency diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index d59e3ba7571..9647efe0803 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -446,6 +446,9 @@ def test_series_describe_timedelta(dtype): pd.Series([True, False, True, True, False]), pd.Series([], dtype="str"), pd.Series(["a", "b", "c", "a"], dtype="category"), + pd.Series(["d", "e", "f"], dtype="category"), + pd.Series(["e", "d", "f"], dtype="category"), + pd.Series(["f", "e", "d"], dtype="category"), ], ) def test_series_describe_other_types(ps): From ebcf7f86c761556dd229ac9bc46cbf642e63046e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 8 Dec 2021 10:13:22 -0800 Subject: [PATCH 2/4] style --- python/cudf/cudf/core/series.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index fc349a1ffb3..ca1b6c03d49 100644 --- a/python/cudf/cudf/core/series.py +++ 
b/python/cudf/cudf/core/series.py @@ -3372,7 +3372,9 @@ def _describe_categorical(self): if data[1] > 0: # In case there's a tie, break the tie by sorting the index # and take the top. - tied_val_counts = val_counts[val_counts == val_counts[0]].sort_index() + tied_val_counts = val_counts[ + val_counts == val_counts[0] + ].sort_index() top, freq = tied_val_counts.index[0], tied_val_counts.iloc[0] data += [str(top), freq] # If the DataFrame is empty, set 'top' and 'freq' to None From f23933b6cdbbbf9277bc9d27cbd7f0dfbf1dd43a Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 9 Dec 2021 20:55:16 -0800 Subject: [PATCH 3/4] use dict and enable test --- python/cudf/cudf/core/series.py | 141 +++++++++++++------------- python/cudf/cudf/tests/test_series.py | 38 +++---- 2 files changed, 88 insertions(+), 91 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ca1b6c03d49..cab60a2e2cd 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3312,51 +3312,55 @@ def _format_percentile_names(percentiles): return ["{0}%".format(int(x * 100)) for x in percentiles] def _format_stats_values(stats_data): - return list(map(lambda x: round(x, 6), stats_data)) + return map(lambda x: round(x, 6), stats_data) def _describe_numeric(self): # mimicking pandas - index = ( - ["count", "mean", "std", "min"] - + _format_percentile_names(percentiles) - + ["max"] - ) - data = ( - [self.count(), self.mean(), self.std(), self.min()] - + self.quantile(percentiles).to_numpy(na_value=np.nan).tolist() - + [self.max()] - ) - data = _format_stats_values(data) + data = { + "count": self.count(), + "mean": self.mean(), + "std": self.std(), + "min": self.min(), + **dict( + zip( + _format_percentile_names(percentiles), + self.quantile(percentiles) + .to_numpy(na_value=np.nan) + .tolist(), + ) + ), + "max": self.max(), + } return Series( - data=data, index=index, nan_as_null=False, name=self.name, + 
data=_format_stats_values(data.values()), + index=data.keys(), + nan_as_null=False, + name=self.name, ) def _describe_timedelta(self): # mimicking pandas - index = ( - ["count", "mean", "std", "min"] - + _format_percentile_names(percentiles) - + ["max"] - ) - - data = ( - [ - str(self.count()), - str(self.mean()), - str(self.std()), - str(pd.Timedelta(self.min())), - ] - + self.quantile(percentiles) - .astype("str") - .to_numpy(na_value=None) - .tolist() - + [str(pd.Timedelta(self.max()))] - ) + data = { + "count": str(self.count()), + "mean": str(self.mean()), + "std": str(self.std()), + "min": pd.Timedelta(self.min()), + **dict( + zip( + _format_percentile_names(percentiles), + self.quantile(percentiles) + .astype("str") + .to_numpy(na_value=np.nan) + .tolist(), + ) + ), + "max": str(pd.Timedelta(self.max())), + } return Series( - data=data, - index=index, + data=data.values(), + index=data.keys(), dtype="str", nan_as_null=False, name=self.name, @@ -3365,56 +3369,55 @@ def _describe_timedelta(self): def _describe_categorical(self): # blocked by StringColumn/DatetimeColumn support for # value_counts/unique - index = ["count", "unique", "top", "freq"] - val_counts = self.value_counts(ascending=False) - data = [self.count(), self.unique().size] - - if data[1] > 0: + data = { + "count": self.count(), + "unique": len(self.unique()), + "top": None, + "freq": None, + } + if data["count"] > 0: # In case there's a tie, break the tie by sorting the index # and take the top. 
+ val_counts = self.value_counts(ascending=False) tied_val_counts = val_counts[ - val_counts == val_counts[0] + val_counts == val_counts.iloc[0] ].sort_index() - top, freq = tied_val_counts.index[0], tied_val_counts.iloc[0] - data += [str(top), freq] - # If the DataFrame is empty, set 'top' and 'freq' to None - # to maintain output shape consistency - else: - data += [None, None] + data.update( + { + "top": tied_val_counts.index[0], + "freq": tied_val_counts.iloc[0], + } + ) return Series( - data=data, + data=data.values(), dtype="str", - index=index, + index=data.keys(), nan_as_null=False, name=self.name, ) def _describe_timestamp(self): - - index = ( - ["count", "mean", "min"] - + _format_percentile_names(percentiles) - + ["max"] - ) - - data = ( - [ - str(self.count()), - str(self.mean().to_numpy().astype("datetime64[ns]")), - str(pd.Timestamp(self.min().astype("datetime64[ns]"))), - ] - + self.quantile(percentiles) - .astype("str") - .to_numpy(na_value=None) - .tolist() - + [str(pd.Timestamp((self.max()).astype("datetime64[ns]")))] - ) + data = { + "count": str(self.count()), + "mean": str(pd.Timestamp(self.mean())), + "min": str(pd.Timestamp(self.min())), + **dict( + zip( + _format_percentile_names(percentiles), + self.quantile(percentiles) + .astype(self.dtype) + .astype("str") + .to_numpy(na_value=np.nan), + ) + ), + "max": str(pd.Timestamp((self.max()))), + } return Series( - data=data, + data=data.values(), dtype="str", - index=index, + index=data.keys(), nan_as_null=False, name=self.name, ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 9647efe0803..478116184db 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -12,7 +12,6 @@ import cudf from cudf.testing._utils import ( - DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES, assert_eq, @@ -402,30 +401,21 @@ def test_series_describe_numeric(dtype): assert_eq(expected, actual) 
-@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/6219") -@pytest.mark.parametrize("dtype", DATETIME_TYPES) +@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) def test_series_describe_datetime(dtype): + # Note that other datetime units are not tested because pandas does not + # support them. When given coarser units, cuDF datetime columns cannot + # represent fractional time for quantiles of the column, which may require + # interpolation. This differs from pandas, which always stays in [ns]. gs = cudf.Series([0, 1, 2, 3, 1, 2, 3], dtype=dtype) ps = gs.to_pandas() - pdf_results = ps.describe(datetime_is_numeric=True) - gdf_results = gs.describe() - - # Assert count - p_count = pdf_results["count"] - g_count = gdf_results["count"] - - assert_eq(int(g_count), p_count) - - # Assert Index - assert_eq(gdf_results.index, pdf_results.index) + # Treating datetimes as categoricals is deprecated in pandas and will + # be removed in the future; datetimes will then be treated as numeric. 
+ expected = ps.describe(datetime_is_numeric=True) + actual = gs.describe() - # Assert rest of the element apart from - # the first index('count') - actual = gdf_results.tail(-1).astype("datetime64[ns]") - expected = pdf_results.tail(-1).astype("str").astype("datetime64[ns]") - - assert_eq(expected, actual) + assert_eq(expected.astype("str"), actual) @pytest.mark.parametrize("dtype", TIMEDELTA_TYPES) @@ -447,8 +437,12 @@ def test_series_describe_timedelta(dtype): pd.Series([], dtype="str"), pd.Series(["a", "b", "c", "a"], dtype="category"), pd.Series(["d", "e", "f"], dtype="category"), - pd.Series(["e", "d", "f"], dtype="category"), - pd.Series(["f", "e", "d"], dtype="category"), + pd.Series(pd.Categorical(["d", "e", "f"], categories=["f", "e", "d"])), + pd.Series( + pd.Categorical( + ["d", "e", "f"], categories=["f", "e", "d"], ordered=True + ) + ), ], ) def test_series_describe_other_types(ps): From 1136a80afdfe9d268739dcfb763926526a73bd0e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 13 Dec 2021 15:14:03 -0800 Subject: [PATCH 4/4] Update python/cudf/cudf/core/series.py Co-authored-by: Bradley Dice --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index cab60a2e2cd..51be60833e6 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3345,7 +3345,7 @@ def _describe_timedelta(self): "count": str(self.count()), "mean": str(self.mean()), "std": str(self.std()), - "min": pd.Timedelta(self.min()), + "min": str(pd.Timedelta(self.min())), **dict( zip( _format_percentile_names(percentiles),