From bf978dd88d7f8eb70cfe390cd99f229d4ba14d77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Wed, 9 Nov 2022 16:42:13 +0100
Subject: [PATCH 1/5] make aggregation of empty GroupedDataFrame correct with
 AsTable

---
 NEWS.md                                   |  5 ++
 src/groupeddataframe/complextransforms.jl |  1 -
 src/groupeddataframe/splitapplycombine.jl | 42 +++++++++------
 test/grouping.jl                          | 65 +++++++++++++++++++++++
 4 files changed, 97 insertions(+), 16 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 8a4b24f703..e4476a5efc 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,10 @@
 # DataFrames.jl v1.4.3 Patch Release Notes
 
+## Bug fixes
+
+* Correctly handle `GroupedDataFrame` with no groups in multi column operation specification syntax
+  ([#3122](https://github.com/JuliaData/DataFrames.jl/issues/3122))
+
 ## Display improvements
 
 * Improve printing of grouping keys when displaying `GroupedDataFrame`
diff --git a/src/groupeddataframe/complextransforms.jl b/src/groupeddataframe/complextransforms.jl
index 8a922d9547..e7e256cac1 100644
--- a/src/groupeddataframe/complextransforms.jl
+++ b/src/groupeddataframe/complextransforms.jl
@@ -28,7 +28,6 @@ function _combine_with_first((first,)::Ref{Any},
     @assert first isa Union{NamedTuple, DataFrameRow, AbstractDataFrame}
     @assert f isa Base.Callable
     @assert incols isa Union{Nothing, AbstractVector, Tuple, NamedTuple}
-    @assert first isa Union{NamedTuple, DataFrameRow, AbstractDataFrame}
     extrude = false
 
     lgd = length(gd)
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index 54580289c4..161ec7bb25 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -486,6 +486,23 @@ function _combine_process_pair_symbol(optional_i::Bool,
     end
 end
 
+@noinline function expand_res_astable(res, kp1, emptyres::Bool)
+    prepend = all(x -> x isa Integer, kp1)
+    if !(prepend || all(x -> x isa Symbol, kp1) || all(x -> x isa AbstractString, kp1))
+        throw(ArgumentError("keys of the returned elements must be " *
+                            "`Symbol`s, strings or integers"))
+    end
+    if any(x -> !isequal(keys(x), kp1), res)
+        throw(ArgumentError("keys of the returned elements must be identical"))
+    end
+    outcols = [[x[n] for x in res] for n in kp1]
+    # make sure we only infer column names and types for empty res, but do not
+    # produce values that were generated when computing firstres
+    emptyres && foreach(empty!, outcols)
+    nms = [prepend ? Symbol("x", n) : Symbol(n) for n in kp1]
+    return outcols, nms
+end
+
 # perform a transformation specified using the Pair notation with multiple output columns
 function _combine_process_pair_astable(optional_i::Bool,
                                        gd::GroupedDataFrame,
@@ -506,19 +523,15 @@ function _combine_process_pair_astable(optional_i::Bool,
                                                  firstmulticol, NOTHING_IDX_AGG, threads)
         @assert length(outcol_vec) == 1
         res = outcol_vec[1]
-        @assert length(res) > 0
-
-        kp1 = keys(res[1])
-        prepend = all(x -> x isa Integer, kp1)
-        if !(prepend || all(x -> x isa Symbol, kp1) || all(x -> x isa AbstractString, kp1))
-            throw(ArgumentError("keys of the returned elements must be " *
-                                "`Symbol`s, strings or integers"))
-        end
-        if any(x -> !isequal(keys(x), kp1), res)
-            throw(ArgumentError("keys of the returned elements must be identical"))
+        if isempty(res)
+            emptyres = true
+            res = firstres
+        else
+            emptyres = false
         end
-        outcols = [[x[n] for x in res] for n in kp1]
-        nms = [prepend ? Symbol("x", n) : Symbol(n) for n in kp1]
+        kp1 = isempty(res) ? () : kp1 = keys(res[1])
+
+        outcols, nms = expand_res_astable(res, kp1, emptyres)
     else
         if !firstmulticol
             firstres = Tables.columntable(firstres)
@@ -527,9 +540,8 @@ function _combine_process_pair_astable(optional_i::Bool,
         end
         idx, outcols, nms = _combine_multicol(Ref{Any}(firstres), Ref{Any}(fun), gd,
                                               wincols, threads)
-
         if !(firstres isa Union{AbstractVecOrMat, AbstractDataFrame,
-            NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}})
+             NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}})
             lock(gd.lazy_lock) do
                 # if idx_agg was not computed yet it is nothing
                 # in this case if we are not passed a vector compute it.
@@ -541,8 +553,8 @@ function _combine_process_pair_astable(optional_i::Bool,
                 idx = idx_agg[]
             end
         end
-        @assert length(outcols) == length(nms)
     end
+    @assert length(outcols) == length(nms)
     if out_col_name isa AbstractVector{Symbol}
         if length(out_col_name) != length(nms)
             throw(ArgumentError("Number of returned columns is $(length(nms)) " *
diff --git a/test/grouping.jl b/test/grouping.jl
index d26f09cb22..089f4257ec 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -4312,4 +4312,69 @@ end
     @test_throws ArgumentError gdf[Not([true true true true])]
 end
 
+@testset "aggregation of empty GroupedDataFrame with table output" begin
+    df = DataFrame(:a => Int[])
+    gdf = groupby(df, :a)
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y="a")]) => AsTable, :a => :b),
+                        DataFrame(a=Int[], x=Int[], y=String[], b=Int[]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(1, "a")]) => AsTable, :a => :b),
+                        DataFrame(a=Int[], x1=Int[], x2=String[], b=Int[]))
+    @test isequal_typed(combine(gdf, :a => (x -> ["ab"]) => AsTable, :a => :b),
+                        DataFrame(a=Int[], x1=Char[], x2=Char[], b=Int[]))
+    # test below errors because keys for strings do not support == comparison
+    @test_throws ArgumentError combine(gdf, :a => (x -> ["ab", "cd"]) => AsTable, :a => :b)
+    @test isequal_typed(combine(gdf, :a => (x -> []) => AsTable, :a => :b),
+                        DataFrame(a=Int[], b=Int[]))
+    @test_throws ArgumentError combine(gdf, :a => (x -> [(a=x, b=x), (a=x, c=x)]) => AsTable)
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y=2), (x=3, y="a")]) => AsTable),
+                        DataFrame(a=Int[], x=Int[], y=Any[]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => AsTable),
+                        DataFrame(a=Int[], x=Vector{Int}[], y=Any[]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]),
+                        DataFrame(a=Int[], z1=Vector{Int}[], z2=Any[]))
+    @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3])
+
+    df = DataFrame(:a => [1, 2])
+    gdf = groupby(df, :a)[2:1]
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y="a")]) => AsTable, :a => :b),
+                        DataFrame(a=Int[], x=Int[], y=String[], b=Int[]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(1, "a")]) => AsTable, :a => :b),
+                        DataFrame(a=Int[], x1=Int[], x2=String[], b=Int[]))
+    @test isequal_typed(combine(gdf, :a => (x -> ["ab"]) => AsTable, :a => :b),
+                        DataFrame(a=Int[], x1=Char[], x2=Char[], b=Int[]))
+    # test below errors because keys for strings do not support == comparison
+    @test_throws ArgumentError combine(gdf, :a => (x -> ["ab", "cd"]) => AsTable, :a => :b)
+    @test isequal_typed(combine(gdf, :a => (x -> []) => AsTable, :a => :b),
+                        DataFrame(a=Int[], b=Int[]))
+    @test_throws ArgumentError combine(gdf, :a => (x -> [(a=x, b=x), (a=x, c=x)]) => AsTable)
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y=2), (x=3, y="a")]) => AsTable),
+                        DataFrame(a=Int[], x=Int[], y=Any[]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => AsTable),
+                        DataFrame(a=Int[], x=Vector{Int}[], y=Any[]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]),
+                        DataFrame(a=Int[], z1=Vector{Int}[], z2=Any[]))
+    @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3])
+
+    df = DataFrame(:a => [1, 2])
+    gdf = groupby(df, :a)
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y="a")]) => AsTable, :a => :b),
+                        DataFrame(a=1:2, x=[1, 1], y=["a", "a"], b=1:2))
+    @test isequal_typed(combine(gdf, :a => (x -> [(1, "a")]) => AsTable, :a => :b),
+                        DataFrame(a=1:2, x1=[1, 1], x2=["a", "a"], b=1:2))
+    @test isequal_typed(combine(gdf, :a => (x -> ["ab"]) => AsTable, :a => :b),
+                        DataFrame(a=1:2, x1=['a', 'a'], x2=['b', 'b'], b=1:2))
+    # test below errors because keys for strings do not support == comparison
+    @test_throws ArgumentError combine(gdf, :a => (x -> ["ab", "cd"]) => AsTable, :a => :b)
+    @test isequal_typed(combine(gdf, :a => (x -> []) => AsTable, :a => :b),
+                        DataFrame(a=1:2, b=1:2))
+    @test_throws ArgumentError combine(gdf, :a => (x -> [(a=x, b=x), (a=x, c=x)]) => AsTable)
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y=2), (x=3, y="a")]) => AsTable),
+                        DataFrame(a=[1, 1, 2, 2], x=[1, 3, 1, 3], y=Any[2, "a", 2, "a"]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => AsTable),
+                        DataFrame(a=[1, 1, 2, 2], x=[[1], [3], [1], [3]], y=Any[2, "a", 2, "a"]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]),
+                        DataFrame(a=[1, 1, 2, 2], z1=[[1], [3], [1], [3]], z2=Any[2, "a", 2, "a"]))
+    @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3])
+end
+
 end # module

From f382932380debdcabeba79f613b88ffba0300e48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Wed, 9 Nov 2022 18:12:50 +0100
Subject: [PATCH 2/5] improve test coverage

---
 test/grouping.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/grouping.jl b/test/grouping.jl
index 089f4257ec..670179ff87 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -4375,6 +4375,7 @@ end
     @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]),
                         DataFrame(a=[1, 1, 2, 2], z1=[[1], [3], [1], [3]], z2=Any[2, "a", 2, "a"]))
     @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3])
+    @test_throws ArgumentError combine(gdf, :a => (x -> [Dict('x' => 1)]) => AsTable)
 end
 
 end # module

From 78b386f557f9e1414234da0be3fc9474497fbd70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Wed, 9 Nov 2022 21:27:42 +0100
Subject: [PATCH 3/5] improve documentation

---
 docs/src/man/split_apply_combine.md | 15 +++++++++++----
 src/abstractdataframe/selection.jl  | 15 ++++++++++-----
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
index fc1d60fec9..e1e81c296a 100644
--- a/docs/src/man/split_apply_combine.md
+++ b/docs/src/man/split_apply_combine.md
@@ -30,15 +30,22 @@ object from your data frame using the `groupby` function that takes two argument
 (1) a data frame to be grouped, and (2) a set of columns to group by.
 
 Operations can then be applied on each group using one of the following functions:
-* `combine`: does not put restrictions on number of rows returned, the order of rows
-  is specified by the order of groups in `GroupedDataFrame`; it is typically used
-  to compute summary statistics by group;
+* `combine`: does not put restrictions on number of rows returned per group;
+  the returned values are vertically concatenaded following order of groups in
+  `GroupedDataFrame`; it is typically used to compute summary statistics by group;
+  for `GroupedDataFrame` if grouping columns are kept they are put as first columns
+  in the result;
 * `select`: return a data frame with the number and order of rows exactly the same
   as the source data frame, including only new calculated columns;
   `select!` is an in-place version of `select`;
 * `transform`: return a data frame with the number and order of rows exactly the same
   as the source data frame, including all columns from the source and new calculated columns;
-  `transform!` is an in-place version of `transform`.
+  `transform!` is an in-place version of `transform`;
+  existing columns in the source data frame are put as first columns in the result;
+
+As a special case, if a `GroupedDataFrame` that has zero groups is passed then
+the result of the operation is determined by performing a single call to the
+transformation function with a 0-row argument passed to it.
 
 All these functions take a specification of one or more functions to apply to
 each subset of the `DataFrame`. This specification can be of the following forms:
diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 0d1363fe89..56745c8b97 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -37,19 +37,24 @@ const TRANSFORMATION_COMMON_RULES =
     (1) a data frame to be grouped, and (2) a set of columns to group by.
 
     Operations can then be applied on each group using one of the following functions:
-    * `combine`: does not put restrictions on number of rows returned, the order of rows
-      is specified by the order of groups in `GroupedDataFrame`; it is typically used
-      to compute summary statistics by group; for `GroupedDataFrame` if grouping columns
-      are kept they are put as first columns in the result;
+    * `combine`: does not put restrictions on number of rows returned per group;
+      the returned values are vertically concatenaded following order of groups in
+      `GroupedDataFrame`; it is typically used to compute summary statistics by group;
+      for `GroupedDataFrame` if grouping columns are kept they are put as first columns
+      in the result;
     * `select`: return a data frame with the number and order of rows exactly the same
       as the source data frame, including only new calculated columns;
       `select!` is an in-place version of `select`; for `GroupedDataFrame` if grouping columns
       are kept they are put as first columns in the result;
     * `transform`: return a data frame with the number and order of rows exactly the same
       as the source data frame, including all columns from the source and new calculated columns;
-      `transform!` is an in-place version of `transform`; for `GroupedDataFrame`
+      `transform!` is an in-place version of `transform`;
       existing columns in the source data frame are put as first columns in the result;
 
+    As a special case, if a `GroupedDataFrame` that has zero groups is passed then
+    the result of the operation is determined by performing a single call to the
+    transformation function with a 0-row argument passed to it.
+
     All these functions take a specification of one or more functions to apply to
     each subset of the `DataFrame`. This specification can be of the following forms:
     1. standard column selectors (integers, `Symbol`s, strings, vectors of integers,

From c567e10aaf4993c2336a77d646974c07e7e5a710 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 12 Nov 2022 14:15:40 +0100
Subject: [PATCH 4/5] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/groupeddataframe/splitapplycombine.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index 161ec7bb25..b627b7ba5d 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -493,7 +493,7 @@ end
                             "`Symbol`s, strings or integers"))
     end
     if any(x -> !isequal(keys(x), kp1), res)
-        throw(ArgumentError("keys of the returned elements must be identical"))
+        throw(ArgumentError("keys of the returned elements must be equal"))
     end
     outcols = [[x[n] for x in res] for n in kp1]
     # make sure we only infer column names and types for empty res, but do not
@@ -529,7 +529,7 @@ function _combine_process_pair_astable(optional_i::Bool,
         else
             emptyres = false
         end
-        kp1 = isempty(res) ? () : kp1 = keys(res[1])
+        kp1 = isempty(res) ? () : keys(res[1])
 
         outcols, nms = expand_res_astable(res, kp1, emptyres)
     else

From a6ba2da48ad51ea3384089b8a87bec7207aa8573 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 12 Nov 2022 14:21:43 +0100
Subject: [PATCH 5/5] improve docstring

---
 docs/src/man/split_apply_combine.md | 4 +++-
 src/abstractdataframe/selection.jl  | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
index e1e81c296a..5e34a2e175 100644
--- a/docs/src/man/split_apply_combine.md
+++ b/docs/src/man/split_apply_combine.md
@@ -45,7 +45,9 @@ Operations can then be applied on each group using one of the following function
 
 As a special case, if a `GroupedDataFrame` that has zero groups is passed then
 the result of the operation is determined by performing a single call to the
-transformation function with a 0-row argument passed to it.
+transformation function with a 0-row argument passed to it. The output of this
+operation is only used to identify the number and type of produced columns, but
+the result has zero rows.
 
 All these functions take a specification of one or more functions to apply to
 each subset of the `DataFrame`. This specification can be of the following forms:
diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 56745c8b97..9e32989c68 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -53,7 +53,9 @@ const TRANSFORMATION_COMMON_RULES =
 
     As a special case, if a `GroupedDataFrame` that has zero groups is passed then
     the result of the operation is determined by performing a single call to the
-    transformation function with a 0-row argument passed to it.
+    transformation function with a 0-row argument passed to it. The output of this
+    operation is only used to identify the number and type of produced columns, but
+    the result has zero rows.
 
     All these functions take a specification of one or more functions to apply to
     each subset of the `DataFrame`. This specification can be of the following forms: