Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Groupby pct_change #11144

Merged
merged 21 commits into from
Jul 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
1bed7e5
create new pr
skirui-source Mar 16, 2022
942c808
Merge branch 'branch-22.04' of https://github.com/rapidsai/cudf into …
skirui-source Mar 22, 2022
5b204c1
Merge branch 'branch-22.06' of https://github.com/rapidsai/cudf into …
skirui-source Mar 23, 2022
1187d8e
Merge branch 'branch-22.06' of https://github.com/rapidsai/cudf into …
skirui-source Mar 31, 2022
1a4f866
added pct_change for groupby object
skirui-source Apr 1, 2022
2b1ff89
Merge branch 'groupby.pct_change' of github.com:skirui-source/cudf in…
skirui-source May 2, 2022
d40495a
Merge branch 'branch-22.08' of https://github.com/rapidsai/cudf into …
skirui-source Jun 24, 2022
fb4b522
added tests wip: creating replacement pr
skirui-source Jun 24, 2022
3dfaee6
Merge branch 'branch-22.08' of https://github.com/rapidsai/cudf into …
skirui-source Jun 28, 2022
9a941db
Merge branch 'branch-22.08' of https://github.com/rapidsai/cudf into …
skirui-source Jun 28, 2022
709d3cb
Merge branch 'branch-22.08' of https://github.com/rapidsai/cudf into …
skirui-source Jun 29, 2022
2173886
Merge branch 'groupby.pct_change' of github.com:skirui-source/cudf in…
skirui-source Jul 7, 2022
d9090d3
Merge branch 'branch-22.08' of https://github.com/rapidsai/cudf into …
skirui-source Jul 7, 2022
970f9c3
Merge branch 'branch-22.08' of https://github.com/rapidsai/cudf into …
skirui-source Jul 8, 2022
14c65c0
Merge branch 'branch-22.08' of https://github.com/rapidsai/cudf into …
skirui-source Jul 12, 2022
4131641
perform fillna on dataframe rather than groupby object itself
skirui-source Jul 12, 2022
d76071b
Merge branch 'branch-22.08' of https://github.com/rapidsai/cudf into …
skirui-source Jul 13, 2022
110d0ac
mimic pandas implementation of pct_change, xfail multiindex test
skirui-source Jul 13, 2022
7f68522
added link to created issue for solving fillna() mismatch behavior
skirui-source Jul 13, 2022
ee05ae1
Merge branch 'branch-22.08' of https://github.com/rapidsai/cudf into …
skirui-source Jul 13, 2022
584dbe2
Merge branch 'branch-22.08' of https://github.com/rapidsai/cudf into …
skirui-source Jul 14, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1416,6 +1416,48 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
result = self._mimic_pandas_order(result)
return result._copy_type_metadata(values)

def pct_change(
    self, periods=1, fill_method="ffill", axis=0, limit=None, freq=None
):
    """
    Calculates the percent change between sequential elements
    in the group.

    Missing values are filled within each group using ``fill_method``,
    the filled values are shifted by ``periods`` within each group, and
    the result is ``filled / shifted - 1``.

    Parameters
    ----------
    periods : int, default 1
        Periods to shift for forming percent change.
    fill_method : str, default 'ffill'
        How to handle NAs before computing percent changes.
        Must be one of 'ffill', 'pad', 'bfill', or 'backfill'.
    axis : int, default 0
        Axis along which to compute. Only ``axis=0`` is supported.
    limit : int, optional
        The number of consecutive NAs to fill before stopping.
        Not yet implemented.
    freq : str, optional
        Increment to use from time series API.
        Not yet implemented.

    Returns
    -------
    Series or DataFrame
        Percentage changes within each group

    Raises
    ------
    NotImplementedError
        If ``axis`` is not 0, or if ``limit`` or ``freq`` is provided.
    ValueError
        If ``fill_method`` is not one of the supported method names.
    """
    if axis != 0:
        raise NotImplementedError("Only axis=0 is supported.")
    if limit is not None:
        raise NotImplementedError("limit parameter not supported yet.")
    if freq is not None:
        raise NotImplementedError("freq parameter not supported yet.")
    # Tuple membership (rather than a set) so that an unhashable argument
    # such as a list is reported with the ValueError below instead of
    # leaking a TypeError from hashing.
    if fill_method not in ("ffill", "pad", "bfill", "backfill"):
        raise ValueError(
            "fill_method must be one of 'ffill', 'pad', "
            "'bfill', or 'backfill'."
        )

    # Fill first, then regroup the filled frame with the same grouping so
    # the shift is applied per group on the already-filled values.
    filled = self.fillna(method=fill_method, limit=limit)
    fill_grp = filled.groupby(self.grouping)
    shifted = fill_grp.shift(periods=periods, freq=freq)
    return (filled / shifted) - 1

def _mimic_pandas_order(
self, result: DataFrameOrSeries
) -> DataFrameOrSeries:
Expand Down
73 changes: 73 additions & 0 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2604,3 +2604,76 @@ def test_groupby_transform_maintain_index(by):
assert_groupby_results_equal(
pdf.groupby(by).transform("max"), gdf.groupby(by).transform("max")
)


@pytest.mark.parametrize(
    "data, gkey",
    [
        # String keys, integer value columns.
        (
            {
                "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2],
                "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1],
                "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1],
            },
            ["id"],
        ),
        # Two grouping keys, float values with a missing entry.
        (
            {
                "id": [0, 0, 0, 0, 1, 1, 1],
                "a": [1, 3, 4, 2.0, -3.0, 9.0, 10.0],
                "b": [10.0, 23, -4.0, 2, -3.0, None, 19.0],
            },
            ["id", "a"],
        ),
        # Value column that is entirely null.
        (
            {
                "id": ["a", "a", "b", "b", "c", "c"],
                "val1": [None, None, None, None, None, None],
            },
            ["id"],
        ),
    ],
)
@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5])
@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"])
def test_groupby_pct_change(data, gkey, periods, fill_method):
    # Compare cudf's groupby pct_change with the pandas result for the
    # same data, keys, periods and fill method.
    cudf_frame = cudf.DataFrame(data)
    pandas_frame = cudf_frame.to_pandas()

    cudf_grouped = cudf_frame.groupby(gkey)
    actual = cudf_grouped.pct_change(periods=periods, fill_method=fill_method)

    pandas_grouped = pandas_frame.groupby(gkey)
    expected = pandas_grouped.pct_change(
        periods=periods, fill_method=fill_method
    )

    assert_eq(expected, actual)


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/11259")
@pytest.mark.parametrize("periods", [-5, 5])
def test_groupby_pct_change_multiindex_dataframe(periods):
    # Group by both index levels of a MultiIndex frame and compare with
    # pandas; marked xfail, see the issue linked in the decorator.
    columns = {
        "a": [1, 1, 2, 2],
        "b": [1, 1, 2, 3],
        "c": [2, 3, 4, 5],
        "d": [6, 8, 9, 1],
    }
    indexed = cudf.DataFrame(columns).set_index(["a", "b"])

    cudf_grouped = indexed.groupby(level=["a", "b"])
    actual = cudf_grouped.pct_change(periods)

    pandas_grouped = indexed.to_pandas().groupby(level=["a", "b"])
    expected = pandas_grouped.pct_change(periods)

    assert_eq(expected, actual)


def test_groupby_pct_change_empty_columns():
    # A frame with column labels but no rows should give the same
    # pct_change result in cudf and pandas.
    empty_cudf = cudf.DataFrame(columns=["id", "val1", "val2"])
    empty_pandas = empty_cudf.to_pandas()

    cudf_grouped = empty_cudf.groupby("id")
    actual = cudf_grouped.pct_change()

    pandas_grouped = empty_pandas.groupby("id")
    expected = pandas_grouped.pct_change()

    assert_eq(expected, actual)