Break tie for top categorical columns in Series.describe #9867

Merged: 4 commits, Dec 14, 2021
Changes from all commits
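For context on the change itself: Series.describe() reports a "top" (most frequent) value and its "freq" for categorical columns, and when several categories share the highest count the reported "top" previously depended on however value_counts happened to order the ties. The new logic keeps only the tied values and sorts them by index so the result is deterministic. Below is a minimal sketch of that logic written against plain pandas for illustration; the actual change is in _describe_categorical further down.

import pandas as pd

# Three categories, each appearing once: a three-way tie for "top".
s = pd.Series(["d", "e", "f"], dtype="category")

val_counts = s.value_counts(ascending=False)
# Keep only the values tied for the highest count, then sort them by index so
# the winner no longer depends on the unspecified ordering of value_counts.
tied_val_counts = val_counts[val_counts == val_counts.iloc[0]].sort_index()
top, freq = tied_val_counts.index[0], tied_val_counts.iloc[0]
print(top, freq)  # d 1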
144 changes: 76 additions & 68 deletions python/cudf/cudf/core/series.py
@@ -3312,51 +3312,55 @@ def _format_percentile_names(percentiles):
return ["{0}%".format(int(x * 100)) for x in percentiles]

def _format_stats_values(stats_data):
return list(map(lambda x: round(x, 6), stats_data))
return map(lambda x: round(x, 6), stats_data)

def _describe_numeric(self):
# mimicking pandas
index = (
["count", "mean", "std", "min"]
+ _format_percentile_names(percentiles)
+ ["max"]
)
data = (
[self.count(), self.mean(), self.std(), self.min()]
+ self.quantile(percentiles).to_numpy(na_value=np.nan).tolist()
+ [self.max()]
)
data = _format_stats_values(data)
data = {
"count": self.count(),
"mean": self.mean(),
"std": self.std(),
"min": self.min(),
**dict(
zip(
_format_percentile_names(percentiles),
self.quantile(percentiles)
.to_numpy(na_value=np.nan)
.tolist(),
)
),
"max": self.max(),
}

return Series(
data=data, index=index, nan_as_null=False, name=self.name,
data=_format_stats_values(data.values()),
index=data.keys(),
nan_as_null=False,
name=self.name,
)

def _describe_timedelta(self):
# mimicking pandas
index = (
["count", "mean", "std", "min"]
+ _format_percentile_names(percentiles)
+ ["max"]
)

data = (
[
str(self.count()),
str(self.mean()),
str(self.std()),
str(pd.Timedelta(self.min())),
]
+ self.quantile(percentiles)
.astype("str")
.to_numpy(na_value=None)
.tolist()
+ [str(pd.Timedelta(self.max()))]
)
data = {
"count": str(self.count()),
"mean": str(self.mean()),
"std": str(self.std()),
"min": str(pd.Timedelta(self.min())),
**dict(
zip(
_format_percentile_names(percentiles),
self.quantile(percentiles)
.astype("str")
.to_numpy(na_value=np.nan)
.tolist(),
)
),
"max": str(pd.Timedelta(self.max())),
}

return Series(
data=data,
index=index,
data=data.values(),
index=data.keys(),
dtype="str",
nan_as_null=False,
name=self.name,
@@ -3365,51 +3369,55 @@ def _describe_timedelta(self):
def _describe_categorical(self):
# blocked by StringColumn/DatetimeColumn support for
# value_counts/unique
index = ["count", "unique", "top", "freq"]
val_counts = self.value_counts(ascending=False)
data = [self.count(), self.unique().size]

if data[1] > 0:
top, freq = val_counts.index[0], val_counts.iloc[0]
data += [str(top), freq]
# If the DataFrame is empty, set 'top' and 'freq' to None
# to maintain output shape consistency
else:
data += [None, None]
data = {
"count": self.count(),
"unique": len(self.unique()),
"top": None,
"freq": None,
}
if data["count"] > 0:
# In case there's a tie, break the tie by sorting the index
# and taking the top.
val_counts = self.value_counts(ascending=False)
tied_val_counts = val_counts[
val_counts == val_counts.iloc[0]
].sort_index()
data.update(
{
"top": tied_val_counts.index[0],
"freq": tied_val_counts.iloc[0],
}
)

return Series(
data=data,
data=data.values(),
dtype="str",
index=index,
index=data.keys(),
nan_as_null=False,
name=self.name,
)

def _describe_timestamp(self):
Contributor @bdice commented on Dec 13, 2021:

The describe implementations appear to be casting all the values to str (aside from the numeric implementation). This does not align with Pandas behavior:

>>> s = pd.Series([
...   np.datetime64("2000-01-01"),
...   np.datetime64("2010-01-01"),
...   np.datetime64("2010-01-01"),
... ])
>>> print(type(s.describe()["top"]))
<class 'pandas._libs.tslibs.timestamps.Timestamp'>

I recognize there is an issue here with different types, namely that count and freq are not of the same type as mean, min, percentiles, or max.

Some options to resolve this (and their downsides):

  1. Current implementation: return all values as str (results stay on the GPU, but the data is not really usable as a str type).
  2. Return a pd.DataFrame or dict that can have multiple types (not a GPU DataFrame).

Personally I think option (2) is the better choice here. The summary doesn't really need to be a GPU DataFrame since it contains so few values. Do we have precedent for this kind of behavior returning a CPU (Pandas) DataFrame?

I'm content to resolve this with the current implementation as well. Just wanted to note the inconsistency. I can file an issue if needed.

Contributor reply:

I agree that a copy to host that preserves more type information is probably preferable. I'm not aware of any API that returns a pandas DataFrame, but we do return pandas objects in some places. One example is DataFrame.columns, which returns a pandas Index, probably based on the same consideration of not wanting to put the data on device. Personally from an API design perspective I would prefer not to return pandas objects and would rather return something like a dict, but I can see that causing more friction for users switching between pandas and cudf.

IMO this change is out of scope for this PR in any case. We should open an issue to discuss this further and have a more holistic discussion, but we shouldn't hold up progress here.
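
For illustration only, option (2) from this thread might look roughly like the sketch below. This is not code from the PR; _describe_categorical_as_dict is a hypothetical name, and the point is simply that a host-side dict can carry mixed value types (ints for count, unique, and freq, and the original category dtype for top) instead of casting everything to str.

def _describe_categorical_as_dict(self):
    # Hypothetical alternative: build a plain Python dict on the host and
    # preserve native types rather than stringifying every entry.
    out = {
        "count": int(self.count()),
        "unique": len(self.unique()),
        "top": None,
        "freq": None,
    }
    if out["count"] > 0:
        val_counts = self.value_counts(ascending=False)
        tied = val_counts[val_counts == val_counts.iloc[0]].sort_index()
        out["top"], out["freq"] = tied.index[0], int(tied.iloc[0])
    return out  # mixed types; the tiny summary never needs to live on device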


index = (
["count", "mean", "min"]
+ _format_percentile_names(percentiles)
+ ["max"]
)

data = (
[
str(self.count()),
str(self.mean().to_numpy().astype("datetime64[ns]")),
str(pd.Timestamp(self.min().astype("datetime64[ns]"))),
]
+ self.quantile(percentiles)
.astype("str")
.to_numpy(na_value=None)
.tolist()
+ [str(pd.Timestamp((self.max()).astype("datetime64[ns]")))]
)
data = {
"count": str(self.count()),
"mean": str(pd.Timestamp(self.mean())),
"min": str(pd.Timestamp(self.min())),
**dict(
zip(
_format_percentile_names(percentiles),
self.quantile(percentiles)
.astype(self.dtype)
.astype("str")
.to_numpy(na_value=np.nan),
)
),
"max": str(pd.Timestamp((self.max()))),
}

return Series(
data=data,
data=data.values(),
dtype="str",
index=index,
index=data.keys(),
nan_as_null=False,
name=self.name,
)
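As a quick usage example of the _describe_categorical change above (assuming a cuDF build that includes this PR), a categorical column with tied counts now produces the same "top" on every call:

import cudf

s = cudf.Series(["d", "e", "f"], dtype="category")
print(s.describe())
# Expected to report count=3, unique=3, and, with the tie broken by sorted
# index order, top="d" with freq=1; previously "top" could be any tied value.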
37 changes: 17 additions & 20 deletions python/cudf/cudf/tests/test_series.py
@@ -12,7 +12,6 @@

import cudf
from cudf.testing._utils import (
DATETIME_TYPES,
NUMERIC_TYPES,
TIMEDELTA_TYPES,
assert_eq,
@@ -402,30 +401,21 @@ def test_series_describe_numeric(dtype):
assert_eq(expected, actual)


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/6219")
@pytest.mark.parametrize("dtype", DATETIME_TYPES)
@pytest.mark.parametrize("dtype", ["datetime64[ns]"])
def test_series_describe_datetime(dtype):
# Note that other datetime units are not tested because pandas does not
# support them. When coarser units are specified, cuDF datetime columns
# cannot represent fractional time for quantiles of the column, which may
# require interpolation; this differs from pandas, which always stays in
# the [ns] unit.
gs = cudf.Series([0, 1, 2, 3, 1, 2, 3], dtype=dtype)
ps = gs.to_pandas()

pdf_results = ps.describe(datetime_is_numeric=True)
gdf_results = gs.describe()

# Assert count
p_count = pdf_results["count"]
g_count = gdf_results["count"]

assert_eq(int(g_count), p_count)

# Assert Index
assert_eq(gdf_results.index, pdf_results.index)
# Treating datetimes as categoricals is deprecated in pandas and will
# be removed in the future. The future behavior is to treat datetimes as
# numeric.
expected = ps.describe(datetime_is_numeric=True)
actual = gs.describe()

# Assert rest of the element apart from
# the first index('count')
actual = gdf_results.tail(-1).astype("datetime64[ns]")
expected = pdf_results.tail(-1).astype("str").astype("datetime64[ns]")

assert_eq(expected, actual)
assert_eq(expected.astype("str"), actual)


@pytest.mark.parametrize("dtype", TIMEDELTA_TYPES)
@@ -446,6 +436,13 @@ def test_series_describe_timedelta(dtype):
pd.Series([True, False, True, True, False]),
pd.Series([], dtype="str"),
pd.Series(["a", "b", "c", "a"], dtype="category"),
pd.Series(["d", "e", "f"], dtype="category"),
pd.Series(pd.Categorical(["d", "e", "f"], categories=["f", "e", "d"])),
pd.Series(
pd.Categorical(
["d", "e", "f"], categories=["f", "e", "d"], ordered=True
)
),
],
)
def test_series_describe_other_types(ps):
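The new parametrized cases exercise ties under non-default category orderings. A small pandas sketch, for illustration, of why the declared category order matters for the tie-break: sort_index on a CategoricalIndex follows the category order rather than lexicographic order.

import pandas as pd

# Categories declared as ["f", "e", "d"]; every value appears once (a tie).
s = pd.Series(pd.Categorical(["d", "e", "f"], categories=["f", "e", "d"]))
val_counts = s.value_counts(ascending=False)
tied = val_counts[val_counts == val_counts.iloc[0]].sort_index()
print(tied.index[0], tied.iloc[0])  # f 1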