Skip to content

Commit

Permalink
Add GroupBy.dtypes (#12783)
Browse files Browse the repository at this point in the history
This PR adds a `dtypes` property to `GroupBy`. It will also fix some breaking changes introduced upstream in dask: dask/dask#9889

The issue was discovered in #12768 (comment).

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Ashwin Srinath (https://github.com/shwina)

URL: #12783
  • Loading branch information
galipremsagar authored Feb 16, 2023
1 parent 12410d9 commit d787ff2
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 3 deletions.
31 changes: 31 additions & 0 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,37 @@ def __iter__(self):
for i, name in enumerate(group_names):
yield name, grouped_values[offsets[i] : offsets[i + 1]]

@property
def dtypes(self):
    """
    Return the dtypes in this group.

    Returns
    -------
    pandas.DataFrame
        Indexed by the unique grouping keys, with one column per
        name in ``self.grouping.values``; every cell of a column
        holds that column's dtype.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({'a': [1, 2, 3, 3], 'b': ['x', 'y', 'z', 'a'],
    ...                      'c':[10, 11, 12, 12]})
    >>> df.groupby("a").dtypes
            b      c
    a
    1  object  int64
    2  object  int64
    3  object  int64
    """
    # The unique keys become the row index of the pandas result.
    group_keys = self.grouping.keys.unique().to_pandas()

    per_column = {}
    for column_name in self.grouping.values._column_names:
        # A column's dtype is the same for every group, so repeat it
        # once per unique key.
        column_dtype = self.obj._dtypes[column_name]
        per_column[column_name] = [column_dtype] * len(group_keys)

    return pd.DataFrame(per_column, index=group_keys)

@cached_property
def groups(self):
"""
Expand Down
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2960,3 +2960,15 @@ def test_groupby_ngroup(by, ascending, df_ngroup):
expected = df_ngroup.to_pandas().groupby(by).ngroup(ascending=ascending)
actual = df_ngroup.groupby(by).ngroup(ascending=ascending)
assert_eq(expected, actual, check_dtype=False)


@pytest.mark.parametrize(
    "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]]
)
def test_groupby_dtypes(groups):
    """Check that ``groupby(groups).dtypes`` agrees between pandas and cudf.

    ``groups`` is either a single column name or a list of column names
    to group by.
    """
    data = {
        "a": [1, 2, 3, 3],
        "b": ["x", "y", "z", "a"],
        "c": [10, 11, 12, 12],
    }
    gdf = cudf.DataFrame(data)
    # Build the pandas reference from the same frame so both sides
    # start from identical data.
    pdf = gdf.to_pandas()

    expected = pdf.groupby(groups).dtypes
    actual = gdf.groupby(groups).dtypes
    assert_eq(expected, actual)
12 changes: 9 additions & 3 deletions python/cudf_kafka/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
),
)
CUDF_KAFKA_ROOT = os.environ.get(
"CUDF_KAFKA_ROOT", "../../libcudf_kafka/build"
"CUDF_KAFKA_ROOT", "../../cpp/libcudf_kafka/build"
)

try:
Expand Down Expand Up @@ -72,8 +72,14 @@
pa.get_include(),
cuda_include_dir,
],
library_dirs=([get_python_lib(), os.path.join(os.sys.prefix, "lib")]),
libraries=["cudf", "cudf_kafka"],
library_dirs=(
[
get_python_lib(),
os.path.join(os.sys.prefix, "lib"),
CUDF_KAFKA_ROOT,
]
),
libraries=["cudf", "cudf_kafka", "fmt"],
language="c++",
extra_compile_args=["-std=c++17"],
)
Expand Down

0 comments on commit d787ff2

Please sign in to comment.