Skip to content

Commit

Permalink
Add support for numeric_only in DataFrame._reduce (#10629)
Browse files Browse the repository at this point in the history
Add support for `numeric_only` in `DataFrame._reduce`, so that reductions such as `df.mean(numeric_only=True)` can be used. Resolves #2067 and partially addresses #9009.

Authors:
  - https://github.com/martinfalisse

Approvers:
  - Michael Wang (https://github.com/isVoid)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #10629
  • Loading branch information
martinfalisse authored Apr 14, 2022
1 parent ac27757 commit f7c35d5
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 21 deletions.
25 changes: 16 additions & 9 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5180,26 +5180,33 @@ def _reduce(
if level is not None:
raise NotImplementedError("level parameter is not implemented yet")

if numeric_only not in (None, True):
raise NotImplementedError(
"numeric_only parameter is not implemented yet"
source = self
if numeric_only:
numeric_cols = (
name
for name in self._data.names
if is_numeric_dtype(self._data[name])
)
axis = self._get_axis_from_axis_arg(axis)
source = self._get_columns_by_label(numeric_cols)
if source.empty:
return Series(index=cudf.StringIndex([]))

axis = source._get_axis_from_axis_arg(axis)

if axis == 0:
try:
result = [
getattr(self._data[col], op)(**kwargs)
for col in self._data.names
getattr(source._data[col], op)(**kwargs)
for col in source._data.names
]
except AttributeError:
raise TypeError(f"cannot perform {op} with type {self.dtype}")
raise TypeError(f"Not all column dtypes support op {op}")

return Series._from_data(
{None: result}, as_index(self._data.names)
{None: result}, as_index(source._data.names)
)
elif axis == 1:
return self._apply_cupy_method_axis_1(op, **kwargs)
return source._apply_cupy_method_axis_1(op, **kwargs)

@_cudf_nvtx_annotate
def _scan(
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ def _reduce(
if level is not None:
raise NotImplementedError("level parameter is not implemented yet")

if numeric_only not in (None, True):
if numeric_only:
raise NotImplementedError(
"numeric_only parameter is not implemented yet"
f"Series.{op} does not implement numeric_only"
)
try:
return getattr(self._column, op)(**kwargs)
Expand Down
54 changes: 54 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9230,3 +9230,57 @@ def test_dataframe_pct_change(data, periods, fill_method):
expected = pdf.pct_change(periods=periods, fill_method=fill_method)

assert_eq(expected, actual)


def test_mean_timeseries():
    """Compare ``DataFrame.mean`` against pandas on the timeseries dataset.

    With ``numeric_only=True`` the cudf result must equal the pandas
    result.  Without it, ``mean`` is expected to raise ``TypeError``
    (presumably because the dataset holds a non-numeric column -- confirm
    against ``cudf.datasets.timeseries``).
    """
    gpu_frame = cudf.datasets.timeseries()
    host_frame = gpu_frame.to_pandas()

    assert_eq(
        host_frame.mean(numeric_only=True),
        gpu_frame.mean(numeric_only=True),
    )

    # The unrestricted reduction must be rejected rather than silently
    # dropping columns.
    with pytest.raises(TypeError):
        gpu_frame.mean()


@pytest.mark.parametrize(
    "data",
    [
        {
            "a": [1, 2, 3, 4, 5],
            "b": ["a", "b", "c", "d", "e"],
            "c": [1.0, 2.0, 3.0, 4.0, 5.0],
        }
    ],
)
def test_std_different_dtypes(data):
    """Check ``DataFrame.std`` on a frame mixing int, string and float data.

    ``numeric_only=True`` must agree with pandas; calling ``std`` without
    it on this mixed-dtype frame is expected to raise ``TypeError``.
    """
    gpu_frame = cudf.DataFrame(data)
    host_frame = gpu_frame.to_pandas()

    assert_eq(
        host_frame.std(numeric_only=True),
        gpu_frame.std(numeric_only=True),
    )

    # Column "b" holds strings, so the unrestricted reduction must fail.
    with pytest.raises(TypeError):
        gpu_frame.std()


@pytest.mark.parametrize(
    "data",
    [
        {
            "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
            "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"],
            "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"],
        }
    ],
)
def test_empty_numeric_only(data):
    """Check ``prod(numeric_only=True)`` when every column holds strings.

    The cudf result must match what pandas returns for the same
    all-string frame.
    """
    gpu_frame = cudf.DataFrame(data)
    host_frame = gpu_frame.to_pandas()

    assert_eq(
        host_frame.prod(numeric_only=True),
        gpu_frame.prod(numeric_only=True),
    )
83 changes: 73 additions & 10 deletions python/cudf/cudf/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,13 +239,10 @@ def test_misc_quantiles(data, q):
cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]),
cudf.Series([]),
cudf.Series([-3]),
randomdata(
nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str}
),
],
)
@pytest.mark.parametrize("null_flag", [False, True])
def test_kurtosis(data, null_flag):
def test_kurtosis_series(data, null_flag):
pdata = data.to_pandas()

if null_flag and len(data) > 2:
Expand All @@ -262,8 +259,13 @@ def test_kurtosis(data, null_flag):
expected = pdata.kurt()
np.testing.assert_array_almost_equal(got, expected)

got = data.kurt(numeric_only=False)
got = got if np.isscalar(got) else got.to_numpy()
expected = pdata.kurt(numeric_only=False)
np.testing.assert_array_almost_equal(got, expected)

with pytest.raises(NotImplementedError):
data.kurt(numeric_only=False)
data.kurt(numeric_only=True)


@pytest.mark.parametrize(
Expand All @@ -280,13 +282,10 @@ def test_kurtosis(data, null_flag):
cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]),
cudf.Series([]),
cudf.Series([-3]),
randomdata(
nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str}
),
],
)
@pytest.mark.parametrize("null_flag", [False, True])
def test_skew(data, null_flag):
def test_skew_series(data, null_flag):
pdata = data.to_pandas()

if null_flag and len(data) > 2:
Expand All @@ -298,8 +297,13 @@ def test_skew(data, null_flag):
got = got if np.isscalar(got) else got.to_numpy()
np.testing.assert_array_almost_equal(got, expected)

got = data.skew(numeric_only=False)
expected = pdata.skew(numeric_only=False)
got = got if np.isscalar(got) else got.to_numpy()
np.testing.assert_array_almost_equal(got, expected)

with pytest.raises(NotImplementedError):
data.skew(numeric_only=False)
data.skew(numeric_only=True)


@pytest.mark.parametrize("dtype", params_dtypes)
Expand Down Expand Up @@ -541,3 +545,62 @@ def test_cov_corr_invalid_dtypes(gsr):
rfunc_args_and_kwargs=([gsr],),
compare_error_message=False,
)


@pytest.mark.parametrize(
    "data",
    [
        randomdata(
            nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str}
        ),
    ],
)
@pytest.mark.parametrize("null_flag", [False, True])
def test_kurtosis_df(data, null_flag):
    """Compare DataFrame kurtosis results between cudf and pandas.

    Exercises ``kurtosis()``, its ``kurt()`` alias, and
    ``kurt(numeric_only=True)`` on a frame requested with float, int and
    str dtypes, optionally with rows 0 and 2 nulled out.
    """
    pdata = data.to_pandas()

    if null_flag and len(data) > 2:
        # NOTE(review): this assigns into the parametrized frame in place;
        # the pandas copy was taken above, so it is nulled separately.
        data.iloc[[0, 2]] = None
        pdata.iloc[[0, 2]] = None

    # (method name, keyword arguments) pairs, checked in this order.
    variants = (
        ("kurtosis", {}),
        ("kurt", {}),
        ("kurt", {"numeric_only": True}),
    )
    for method, kwargs in variants:
        got = getattr(data, method)(**kwargs)
        if not np.isscalar(got):
            got = got.to_numpy()
        expected = getattr(pdata, method)(**kwargs)
        np.testing.assert_array_almost_equal(got, expected)


@pytest.mark.parametrize(
    "data",
    [
        randomdata(
            nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str}
        ),
    ],
)
@pytest.mark.parametrize("null_flag", [False, True])
def test_skew_df(data, null_flag):
    """Compare DataFrame skew results between cudf and pandas.

    Exercises ``skew()`` and ``skew(numeric_only=True)`` on a frame
    requested with float, int and str dtypes, optionally with rows 0 and
    2 nulled out.
    """
    pdata = data.to_pandas()

    if null_flag and len(data) > 2:
        # NOTE(review): this assigns into the parametrized frame in place;
        # the pandas copy was taken above, so it is nulled separately.
        data.iloc[[0, 2]] = None
        pdata.iloc[[0, 2]] = None

    # Default call first, then the explicit numeric_only=True call.
    for kwargs in ({}, {"numeric_only": True}):
        got = data.skew(**kwargs)
        expected = pdata.skew(**kwargs)
        if not np.isscalar(got):
            got = got.to_numpy()
        np.testing.assert_array_almost_equal(got, expected)

0 comments on commit f7c35d5

Please sign in to comment.