rapidsai · rapids-bot · Apr 22, 2021 · Mar 24, 2021 · Mar 25, 2021 · Mar 25, 2021
@@ -282,3 +282,38 @@ cdef class _AggregationFactory:
             libcudf_aggregation.udf_type.PTX, cpp_str, out_dtype
         ))
         return agg
+
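+    # Scan aggregations. Each factory below wraps an existing libcudf
+    # aggregation (sum, min, max, count) for use in a scan.
+    # TODO: update this after adding per-algorithm aggregation derived types
+    # https://github.com/rapidsai/cudf/issues/7106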
+    @classmethod
+    def cumsum(cls):
+        """Build the aggregation used for a cumulative-sum scan.
+
+        Returns a new ``Aggregation`` whose ``c_obj`` holds the result of
+        ``make_sum_aggregation()``.
+        """
+        cdef Aggregation scan_agg
+        scan_agg = Aggregation.__new__(Aggregation)
+        scan_agg.c_obj = move(libcudf_aggregation.make_sum_aggregation())
+        return scan_agg
+
+    @classmethod
+    def cummin(cls):
+        """Build the aggregation used for a cumulative-minimum scan.
+
+        Returns a new ``Aggregation`` whose ``c_obj`` holds the result of
+        ``make_min_aggregation()``.
+        """
+        cdef Aggregation scan_agg
+        scan_agg = Aggregation.__new__(Aggregation)
+        scan_agg.c_obj = move(libcudf_aggregation.make_min_aggregation())
+        return scan_agg
+
+    @classmethod
+    def cummax(cls):
+        """Build the aggregation used for a cumulative-maximum scan.
+
+        Returns a new ``Aggregation`` whose ``c_obj`` holds the result of
+        ``make_max_aggregation()``.
+        """
+        cdef Aggregation scan_agg
+        scan_agg = Aggregation.__new__(Aggregation)
+        scan_agg.c_obj = move(libcudf_aggregation.make_max_aggregation())
+        return scan_agg
+
+    @classmethod
+    def cumcount(cls, dropna=False):
+        """Build the aggregation used for a cumulative-count scan.
+
+        Parameters
+        ----------
+        dropna : bool, default False
+            If truthy, the count aggregation is created with
+            ``null_policy.EXCLUDE``; otherwise with ``null_policy.INCLUDE``.
+
+        Returns a new ``Aggregation`` whose ``c_obj`` holds the result of
+        ``make_count_aggregation(...)``.
+        """
+        cdef Aggregation count_agg
+        cdef libcudf_types.null_policy null_mode
+
+        # Start from the default policy and override it only on request.
+        null_mode = libcudf_types.null_policy.INCLUDE
+        if dropna:
+            null_mode = libcudf_types.null_policy.EXCLUDE
+
+        count_agg = Aggregation.__new__(Aggregation)
+        count_agg.c_obj = move(
+            libcudf_aggregation.make_count_aggregation(null_mode)
+        )
+        return count_agg
@@ -65,5 +65,12 @@ cdef extern from "cudf/groupby.hpp" \
             const vector[aggregation_request]& requests,
         ) except +
 
+        # Binding for the `scan` member: takes a vector of aggregation
+        # requests and returns a pair of (unique_ptr to a table, vector of
+        # aggregation results). `except +` makes Cython translate C++
+        # exceptions into Python exceptions.
+        pair[
+            unique_ptr[table],
+            vector[aggregation_result]
+        ] scan(
+            const vector[aggregation_request]& requests,
+        ) except +
+
         groups get_groups() except +
         groups get_groups(table_view values) except +
@@ -24,11 +24,19 @@ cimport cudf._lib.cpp.aggregation as libcudf_aggregation
 # The sets below define the possible aggregations that can be performed on
 # different dtypes. The uppercased versions of these strings correspond to
 # elements of the AggregationKind enum.
+# Names of the aggregations that are treated as groupby scans; a request
+# made up only of these is dispatched to ``scan`` instead of ``aggregate``.
+_GROUPBY_SCANS = {
+    "cumcount",
+    "cumsum",
+    "cummin",
+    "cummax",
+}
+
 _CATEGORICAL_AGGS = {
     "count",
     "size",
     "nunique",
     "unique",
+    "cumcount",
 }
 
 _STRING_AGGS = {
@@ -40,6 +48,7 @@ _STRING_AGGS = {
     "nth",
     "collect",
     "unique",
+    "cumcount",
 }
 
 _LIST_AGGS = {
@@ -135,6 +144,7 @@ cdef class GroupBy:
         cdef Column col
 
         aggregations = _drop_unsupported_aggs(values, aggregations)
+        cdef bool scan = _is_all_scan_aggregate(aggregations)
 
         for i, (col_name, aggs) in enumerate(aggregations.items()):
             col = values._data[col_name]
@@ -154,11 +164,18 @@ cdef class GroupBy:
 
         try:
             with nogil:
-                c_result = move(
-                    self.c_obj.get()[0].aggregate(
-                        c_agg_requests
+                if scan:
+                    c_result = move(
+                        self.c_obj.get()[0].scan(
+                            c_agg_requests
+                        )
+                    )
+                else:
+                    c_result = move(
+                        self.c_obj.get()[0].aggregate(
+                            c_agg_requests
+                        )
                     )
-                )
         except RuntimeError as e:
             # TODO: remove this try..except after
             # https://github.com/rapidsai/cudf/issues/7611
@@ -254,3 +271,35 @@ def _drop_unsupported_aggs(Table values, aggs):
         raise DataError("No numeric types to aggregate")
 
     return result
+
+
+def _is_all_scan_aggregate(aggs):
+    """
+    Return True if every requested aggregation is a scan aggregation.
+
+    Parameters
+    ----------
+    aggs : mapping
+        Maps each column name to an iterable of aggregations, each given
+        either as a name or as a callable.
+
+    Returns
+    -------
+    bool
+        True if at least one aggregation is requested and all of them are
+        in ``_GROUPBY_SCANS``; False if none of them are.
+
+    Raises
+    ------
+    NotImplementedError
+        If both reduction aggregations and scan aggregations are present.
+    """
+
+    def get_name(agg):
+        # Callables are identified by ``__name__``. Callables without one
+        # (e.g. ``functools.partial`` objects) fall back to the object
+        # itself, which never matches a scan name, instead of raising
+        # AttributeError.
+        return getattr(agg, "__name__", agg) if callable(agg) else agg
+
+    # Classify each aggregation exactly once, in a single pass over all
+    # columns, rather than walking the whole mapping twice.
+    is_scan = [
+        get_name(agg) in _GROUPBY_SCANS
+        for col_name in aggs
+        for agg in aggs[col_name]
+    ]
+
+    if not any(is_scan):
+        # No scans at all (this includes an empty request).
+        return False
+    if not all(is_scan):
+        raise NotImplementedError(
+            "Cannot perform both aggregation and scan in one operation"
+        )
+    return True
@@ -588,6 +588,10 @@ def rolling(self, *args, **kwargs):
     "nunique",
     "collect",
     "unique",
+    "cumcount",
+    "cumsum",
+    "cummin",
+    "cummax",
 }
 
 

@@ -27,7 +27,7 @@
 _tomorrow = _now + np.timedelta64(1, "D")
 _now = np.int64(_now.astype("datetime64[ns]"))
 _tomorrow = np.int64(_tomorrow.astype("datetime64[ns]"))
-_index_type_aggs = {"count", "idxmin", "idxmax"}
+_index_type_aggs = {"count", "idxmin", "idxmax", "cumcount"}
 
 
 def assert_groupby_results_equal(expect, got, sort=True, **kwargs):
@@ -1588,3 +1588,34 @@ def test_groupby_unique(by, data, dtype):
     expect = pdf.groupby("by")["data"].unique()
     got = gdf.groupby("by")["data"].unique()
     assert_groupby_results_equal(expect, got)
+
+
+@pytest.mark.parametrize("nelem", [2, 3, 100, 1000])
+@pytest.mark.parametrize("func", ["cummin", "cummax", "cumcount", "cumsum"])
+def test_groupby_2keys_scan(nelem, func):
+    """Check two-key groupby scans against the pandas result."""
+    pdf = make_frame(pd.DataFrame, nelem=nelem)
+    expect_df = pdf.groupby(["x", "y"], sort=True).agg(func)
+    got_df = (
+        make_frame(DataFrame, nelem=nelem)
+        .groupby(["x", "y"], sort=True)
+        .agg(func)
+    )
+    # pd.groupby.cumcount returns a series.
+    if isinstance(expect_df, pd.Series):
+        expect_df = expect_df.to_frame("val")
+    # Index the expected result by the key columns (presumably so that it
+    # lines up with the cudf result -- confirm against the cudf output).
+    expect_df = expect_df.set_index([pdf["x"], pdf["y"]]).sort_index()
+
+    # Skip the dtype comparison for aggregations listed in _index_type_aggs.
+    check_dtype = func not in _index_type_aggs
+    # Pass the arguments in the helper's declared (expect, got) order.
+    assert_groupby_results_equal(expect_df, got_df, check_dtype=check_dtype)
+
+
+def test_groupby_mix_agg_scan():
+    """Scan-only and reduction-only requests run; mixing them raises."""
+    scan_name, reduce_name = "cumsum", "sum"
+    message = "Cannot perform both aggregation and scan in one operation"
+    grouped = make_frame(DataFrame, nelem=10).groupby(["x", "y"], sort=True)
+
+    # Each kind on its own must be accepted without raising.
+    for lone_request in (scan_name, reduce_name, [reduce_name]):
+        grouped.agg(lone_request)
+
+    # A scan and a reduction in the same request is rejected.
+    with pytest.raises(NotImplementedError, match=message):
+        grouped.agg([scan_name, reduce_name])