Add GroupBy.dtypes (#12783)

This PR adds a `dtypes` property to `GroupBy`. It also fixes some upstream dask breaking changes introduced in dask/dask#9889. The issue was discovered in #12768 (comment). Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Ashwin Srinath (https://github.com/shwina) URL: #12783
rapidsai · Feb 16, 2023 · d787ff2 · d787ff2
1 parent 12410d9
commit d787ff2
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 3 deletions.
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -275,6 +275,37 @@ def __iter__(self):
         for i, name in enumerate(group_names):
             yield name, grouped_values[offsets[i] : offsets[i + 1]]
 
+    @property
+    def dtypes(self):
+        """
+        Return the dtypes in this group.
+
+        Returns
+        -------
+        pandas.DataFrame
+            The data type of each column of the group.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'a': [1, 2, 3, 3], 'b': ['x', 'y', 'z', 'a'],
+        ...                      'c':[10, 11, 12, 12]})
+        >>> df.groupby("a").dtypes
+                b      c
+        a
+        1  object  int64
+        2  object  int64
+        3  object  int64
+        """
+        index = self.grouping.keys.unique().to_pandas()
+        return pd.DataFrame(
+            {
+                name: [self.obj._dtypes[name]] * len(index)
+                for name in self.grouping.values._column_names
+            },
+            index=index,
+        )
+
     @cached_property
     def groups(self):
         """

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
@@ -2960,3 +2960,15 @@ def test_groupby_ngroup(by, ascending, df_ngroup):
     expected = df_ngroup.to_pandas().groupby(by).ngroup(ascending=ascending)
     actual = df_ngroup.groupby(by).ngroup(ascending=ascending)
     assert_eq(expected, actual, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]]
+)
+def test_groupby_dtypes(groups):
+    df = cudf.DataFrame(
+        {"a": [1, 2, 3, 3], "b": ["x", "y", "z", "a"], "c": [10, 11, 12, 12]}
+    )
+    pdf = df.to_pandas()
+
+    assert_eq(pdf.groupby(groups).dtypes, df.groupby(groups).dtypes)
diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py
@@ -43,7 +43,7 @@
     ),
 )
 CUDF_KAFKA_ROOT = os.environ.get(
-    "CUDF_KAFKA_ROOT", "../../libcudf_kafka/build"
+    "CUDF_KAFKA_ROOT", "../../cpp/libcudf_kafka/build"
 )
 
 try:
@@ -72,8 +72,14 @@
             pa.get_include(),
             cuda_include_dir,
         ],
-        library_dirs=([get_python_lib(), os.path.join(os.sys.prefix, "lib")]),
-        libraries=["cudf", "cudf_kafka"],
+        library_dirs=(
+            [
+                get_python_lib(),
+                os.path.join(os.sys.prefix, "lib"),
+                CUDF_KAFKA_ROOT,
+            ]
+        ),
+        libraries=["cudf", "cudf_kafka", "fmt"],
         language="c++",
         extra_compile_args=["-std=c++17"],
     )