
Fix warnings in test_stats.py #12293

Merged
merged 12 commits on Dec 5, 2022
8 changes: 3 additions & 5 deletions python/cudf/cudf/core/column/column.py
@@ -1188,15 +1188,13 @@ def _process_for_reduction(

result_col = self

# TODO: If and when pandas decides to validate that `min_count` >= 0 we
# should insert comparable behavior.
# https://github.com/pandas-dev/pandas/issues/50022
if min_count > 0:
valid_count = len(result_col) - result_col.null_count
if valid_count < min_count:
return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
elif min_count < 0:
warnings.warn(
f"min_count value cannot be negative({min_count}), will "
f"default to 0."
)
return result_col

def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
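As a quick illustration of the `min_count` path kept above (a hypothetical snippet, not part of this diff, assuming the public `cudf.Series` reduction API): when fewer valid values remain than `min_count`, the reduction returns NaN.

```python
import cudf

s = cudf.Series([1, None, 3])

print(s.sum(min_count=2))  # 4 -- two valid values, which satisfies min_count
print(s.sum(min_count=3))  # nan -- only two valid values, fewer than min_count
```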
62 changes: 43 additions & 19 deletions python/cudf/cudf/core/dataframe.py
@@ -5861,7 +5861,49 @@ def _reduce(
for col in source._data.names
]
except AttributeError:
raise TypeError(f"Not all column dtypes support op {op}")
numeric_ops = (
"mean",
"min",
"max",
"sum",
"product",
"prod",
"std",
"var",
"kurtosis",
"kurt",
"skew",
)

if numeric_only is None and op in numeric_ops:
warnings.warn(
f"The default value of numeric_only in DataFrame.{op} "
"is deprecated. In a future version, it will default "
"to False. In addition, specifying "
"'numeric_only=None' is deprecated. Select only valid "
"columns or specify the value of numeric_only to "
"silence this warning.",
FutureWarning,
)
numeric_cols = (
name
for name in self._data.names
if is_numeric_dtype(self._data[name])
)
source = self._get_columns_by_label(numeric_cols)
if source.empty:
return Series(index=cudf.StringIndex([]))
try:
result = [
getattr(source._data[col], op)(**kwargs)
for col in source._data.names
]
except AttributeError:
raise TypeError(
f"Not all column dtypes support op {op}"
)
else:
raise

return Series._from_data(
{None: result}, as_index(source._data.names)
@@ -5984,24 +6026,6 @@ def mode(self, axis=0, numeric_only=False, dropna=True):

return df

@_cudf_nvtx_annotate
def kurtosis(
self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
):
obj = self.select_dtypes(include=[np.number, np.bool_])
return super(DataFrame, obj).kurtosis(
axis, skipna, level, numeric_only, **kwargs
)

@_cudf_nvtx_annotate
def skew(
self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
):
obj = self.select_dtypes(include=[np.number, np.bool_])
return super(DataFrame, obj).skew(
axis, skipna, level, numeric_only, **kwargs
)

@_cudf_nvtx_annotate
def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
obj = self.select_dtypes(include="bool") if bool_only else self
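A hedged sketch of what the new `numeric_only` handling in `_reduce` above means in practice (hypothetical snippet, not part of this diff): mixed-dtype reductions now emit a `FutureWarning` and operate on the numeric columns only, instead of raising `TypeError`.

```python
import cudf
import pytest

gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# The default numeric_only=None now warns and drops the non-numeric column.
with pytest.warns(FutureWarning):
    result = gdf.mean()

print(result)  # mean of column "a" only
```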
13 changes: 13 additions & 0 deletions python/cudf/cudf/testing/_utils.py
@@ -387,3 +387,16 @@ def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs):
"left_dtype,right_dtype",
list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
)


@contextmanager
def expect_warning_if(condition, warning=FutureWarning, *args, **kwargs):
"""Catch a warning using pytest.warns if the expect_warning is True.

All arguments are forwarded to pytest.warns if expect_warning is True.
"""
if condition:
with pytest.warns(warning, *args, **kwargs):
yield
else:
yield
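Roughly how the new helper is meant to be used (a hypothetical sketch, not taken from this diff): the body runs either way, and the `pytest.warns` check only kicks in when the condition holds, so parametrized tests don't need two branches.

```python
import warnings

# Condition is True: pytest.warns wraps the body and fails if nothing warns.
with expect_warning_if(True, UserWarning):
    warnings.warn("expected", UserWarning)

# Condition is False: the body runs unwrapped and no warning is required.
with expect_warning_if(False, UserWarning):
    pass
```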
16 changes: 12 additions & 4 deletions python/cudf/cudf/tests/test_dataframe.py
@@ -9541,8 +9541,12 @@ def test_mean_timeseries():

assert_eq(expected, actual)

with pytest.raises(TypeError):
gdf.mean()
with pytest.warns(FutureWarning):
expected = pdf.mean()
with pytest.warns(FutureWarning):
actual = gdf.mean()

assert_eq(expected, actual)


@pytest.mark.parametrize(
@@ -9564,8 +9568,12 @@ def test_std_different_dtypes(data):

assert_eq(expected, actual)

with pytest.raises(TypeError):
gdf.std()
with pytest.warns(FutureWarning):
expected = pdf.std()
with pytest.warns(FutureWarning):
actual = gdf.std()

assert_eq(expected, actual)


@pytest.mark.parametrize(
56 changes: 48 additions & 8 deletions python/cudf/cudf/tests/test_stats.py
@@ -13,6 +13,7 @@
_create_pandas_series,
assert_eq,
assert_exceptions_equal,
expect_warning_if,
)

params_dtypes = [np.int32, np.uint32, np.float32, np.float64]
@@ -399,7 +400,13 @@ def test_cov1d(data1, data2):
ps2 = gs2.to_pandas()

got = gs1.cov(gs2)
expected = ps1.cov(ps2)
ps1_align, ps2_align = ps1.align(ps2, join="inner")
with expect_warning_if(
(len(ps1_align.dropna()) == 1 and len(ps2_align.dropna()) > 0)
or (len(ps2_align.dropna()) == 1 and len(ps1_align.dropna()) > 0),
RuntimeWarning,
):
expected = ps1.cov(ps2)
np.testing.assert_approx_equal(got, expected, significant=8)


@@ -442,7 +449,34 @@ def test_corr1d(data1, data2, method):
ps2 = gs2.to_pandas()

got = gs1.corr(gs2, method)
expected = ps1.corr(ps2, method)

ps1_align, ps2_align = ps1.align(ps2, join="inner")

is_singular = (
len(ps1_align.dropna()) == 1 and len(ps2_align.dropna()) > 0
) or (len(ps2_align.dropna()) == 1 and len(ps1_align.dropna()) > 0)
is_identical = (
len(ps1_align.dropna().unique()) == 1 and len(ps2_align.dropna()) > 0
) or (
len(ps2_align.dropna().unique()) == 1 and len(ps1_align.dropna()) > 0
)

# Pearson correlation leads to division by 0 when either sample size is 1.
# Spearman allows size-1 samples, but warns if all values in a sample are
# identical, since the covariance is zero and so the correlation
# coefficient is not defined.
cond = (is_singular and method == "pearson") or (
is_identical and not is_singular and method == "spearman"
)
if method == "spearman":
import scipy.stats

expected_warning = scipy.stats.ConstantInputWarning
elif method == "pearson":
expected_warning = RuntimeWarning

with expect_warning_if(cond, expected_warning):
expected = ps1.corr(ps2, method)
np.testing.assert_approx_equal(got, expected, significant=8)
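For context on the two warning classes the test distinguishes, here is a hypothetical illustration (not part of this diff); exact behavior depends on the installed pandas and scipy versions.

```python
import pandas as pd

# A single overlapping observation: Pearson divides by a zero standard
# deviation, which surfaces as a RuntimeWarning and a nan result.
pd.Series([1.0]).corr(pd.Series([2.0]), method="pearson")

# A constant sample: Spearman's rank covariance is zero, so scipy emits
# ConstantInputWarning and the coefficient is nan.
pd.Series([3.0, 3.0, 3.0]).corr(pd.Series([1.0, 2.0, 3.0]), method="spearman")
```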


@@ -567,14 +601,18 @@ def test_kurtosis_df(data, null_flag):
data.iloc[[0, 2]] = None
pdata.iloc[[0, 2]] = None

got = data.kurtosis()
with pytest.warns(FutureWarning):
got = data.kurtosis()
got = got if np.isscalar(got) else got.to_numpy()
expected = pdata.kurtosis()
with pytest.warns(FutureWarning):
expected = pdata.kurtosis()
np.testing.assert_array_almost_equal(got, expected)

got = data.kurt()
with pytest.warns(FutureWarning):
got = data.kurt()
got = got if np.isscalar(got) else got.to_numpy()
expected = pdata.kurt()
with pytest.warns(FutureWarning):
expected = pdata.kurt()
np.testing.assert_array_almost_equal(got, expected)

got = data.kurt(numeric_only=True)
@@ -599,8 +637,10 @@ def test_skew_df(data, null_flag):
data.iloc[[0, 2]] = None
pdata.iloc[[0, 2]] = None

got = data.skew()
expected = pdata.skew()
with pytest.warns(FutureWarning):
got = data.skew()
with pytest.warns(FutureWarning):
expected = pdata.skew()
got = got if np.isscalar(got) else got.to_numpy()
np.testing.assert_array_almost_equal(got, expected)
