Skip to content

Commit

Permalink
Fix result representation in groupby.apply (#6790)
Browse files Browse the repository at this point in the history
Fixes: #6745, #6788

This PR fixes the result representation of the groupby `apply` function when the custom UDF returns a scalar, a Series, or a DataFrame.
  • Loading branch information
galipremsagar authored Nov 24, 2020
1 parent db066de commit cdd72c9
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@
- PR #6632 Fix DataFrame initialization from list of dicts
- PR #6767 Fix sort order of parameters in `test_scalar_invalid_implicit_conversion` pytest
- PR #6787 Update java reduction APIs to reflect C++ changes
- PR #6790 Fix result representation in groupby.apply
- PR #6794 Fix AVRO reader issues with empty input
- PR #6798 Fix `read_avro` docs
- PR #6824 Fix JNI build
Expand Down
18 changes: 17 additions & 1 deletion python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,23 @@ def mult(df):
chunks = [
grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:])
]
result = cudf.concat([function(chk) for chk in chunks])
chunk_results = [function(chk) for chk in chunks]

if len(chunk_results) > 0 and cudf.utils.dtypes.is_scalar(
chunk_results[0]
):
result = cudf.Series(
chunk_results, index=self.grouping.keys[offsets[:-1]]
)
result.index.names = self.grouping.names
elif len(chunk_results) > 0 and isinstance(
chunk_results[0], cudf.Series
):
result = cudf.concat(chunk_results, axis=1).T
result.index.names = self.grouping.names
else:
result = cudf.concat(chunk_results)

if self._sort:
result = result.sort_index()
return result
Expand Down
48 changes: 48 additions & 0 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1271,3 +1271,51 @@ def test_groupby_pipe():
actual = gdf.groupby("A").pipe(lambda x: x.max() - x.min())

assert_eq(expected, actual)


def test_groupby_apply_return_scalars():
    """Check groupby.apply against pandas when the UDF returns a scalar."""
    keys = [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]
    values = [0.01, np.nan, 0.03, 0.04, np.nan, 0.06, 0.07, 0.08, 0.09, 1.0]
    pandas_df = pd.DataFrame({"A": keys, "B": values})
    cudf_df = cudf.from_pandas(pandas_df)

    def custom_map_func(x):
        # Count the rows of the group whose "B" value is not null and
        # scale that count by 1/10, yielding one scalar per group.
        non_null_rows = x[~x["B"].isna()]
        return non_null_rows.shape[0] / 10

    expected = pandas_df.groupby("A").apply(lambda x: custom_map_func(x))
    actual = cudf_df.groupby("A").apply(lambda x: custom_map_func(x))

    assert_eq(expected, actual)


@pytest.mark.parametrize(
    "cust_func",
    [lambda x: x - x.max(), lambda x: x.min() - x.max(), lambda x: x.min()],
)
def test_groupby_apply_return_series_dataframe(cust_func):
    """Check groupby.apply against pandas for each parametrized UDF."""
    data = {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]}
    pandas_df = pd.DataFrame(data)
    cudf_df = cudf.from_pandas(pandas_df)

    # pandas result is computed first, then the cudf result, and the two
    # are compared with the test suite's equality helper.
    assert_eq(
        pandas_df.groupby(["key"]).apply(cust_func),
        cudf_df.groupby(["key"]).apply(cust_func),
    )

0 comments on commit cdd72c9

Please sign in to comment.