From bf978dd88d7f8eb70cfe390cd99f229d4ba14d77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 9 Nov 2022 16:42:13 +0100 Subject: [PATCH 1/5] make aggregation of empty GroupedDataFrame correct with AsTable --- NEWS.md | 5 ++ src/groupeddataframe/complextransforms.jl | 1 - src/groupeddataframe/splitapplycombine.jl | 42 +++++++++------ test/grouping.jl | 65 +++++++++++++++++++++++ 4 files changed, 97 insertions(+), 16 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8a4b24f703..e4476a5efc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # DataFrames.jl v1.4.3 Patch Release Notes +## Bug fixes + +* Correctly handle `GroupedDataFrame` with no groups in multi column operation specification syntax + ([#3122](https://github.com/JuliaData/DataFrames.jl/issues/3122)) + ## Display improvements * Improve printing of grouping keys when displaying `GroupedDataFrame` diff --git a/src/groupeddataframe/complextransforms.jl b/src/groupeddataframe/complextransforms.jl index 8a922d9547..e7e256cac1 100644 --- a/src/groupeddataframe/complextransforms.jl +++ b/src/groupeddataframe/complextransforms.jl @@ -28,7 +28,6 @@ function _combine_with_first((first,)::Ref{Any}, @assert first isa Union{NamedTuple, DataFrameRow, AbstractDataFrame} @assert f isa Base.Callable @assert incols isa Union{Nothing, AbstractVector, Tuple, NamedTuple} - @assert first isa Union{NamedTuple, DataFrameRow, AbstractDataFrame} extrude = false lgd = length(gd) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 54580289c4..161ec7bb25 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -486,6 +486,23 @@ function _combine_process_pair_symbol(optional_i::Bool, end end +@noinline function expand_res_astable(res, kp1, emptyres::Bool) + prepend = all(x -> x isa Integer, kp1) + if !(prepend || all(x -> x isa Symbol, kp1) || all(x -> x isa AbstractString, kp1)) + throw(ArgumentError("keys of the returned elements must be " * + "`Symbol`s, strings or integers")) + end + if any(x -> !isequal(keys(x), kp1), res) + throw(ArgumentError("keys of the returned elements must be identical")) + end + outcols = [[x[n] for x in res] for n in kp1] + # make sure we only infer column names and types for empty res, but do not + # produce values that were generated when computing firstres + emptyres && foreach(empty!, outcols) + nms = [prepend ? Symbol("x", n) : Symbol(n) for n in kp1] + return outcols, nms +end + # perform a transformation specified using the Pair notation with multiple output columns function _combine_process_pair_astable(optional_i::Bool, gd::GroupedDataFrame, @@ -506,19 +523,15 @@ function _combine_process_pair_astable(optional_i::Bool, firstmulticol, NOTHING_IDX_AGG, threads) @assert length(outcol_vec) == 1 res = outcol_vec[1] - @assert length(res) > 0 - - kp1 = keys(res[1]) - prepend = all(x -> x isa Integer, kp1) - if !(prepend || all(x -> x isa Symbol, kp1) || all(x -> x isa AbstractString, kp1)) - throw(ArgumentError("keys of the returned elements must be " * - "`Symbol`s, strings or integers")) - end - if any(x -> !isequal(keys(x), kp1), res) - throw(ArgumentError("keys of the returned elements must be identical")) + if isempty(res) + emptyres = true + res = firstres + else + emptyres = false end - outcols = [[x[n] for x in res] for n in kp1] - nms = [prepend ? Symbol("x", n) : Symbol(n) for n in kp1] + kp1 = isempty(res) ? () : kp1 = keys(res[1]) + + outcols, nms = expand_res_astable(res, kp1, emptyres) else if !firstmulticol firstres = Tables.columntable(firstres) @@ -527,9 +540,8 @@ function _combine_process_pair_astable(optional_i::Bool, end idx, outcols, nms = _combine_multicol(Ref{Any}(firstres), Ref{Any}(fun), gd, wincols, threads) - if !(firstres isa Union{AbstractVecOrMat, AbstractDataFrame, - NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}) + NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}) lock(gd.lazy_lock) do # if idx_agg was not computed yet it is nothing # in this case if we are not passed a vector compute it. @@ -541,8 +553,8 @@ function _combine_process_pair_astable(optional_i::Bool, idx = idx_agg[] end end - @assert length(outcols) == length(nms) end + @assert length(outcols) == length(nms) if out_col_name isa AbstractVector{Symbol} if length(out_col_name) != length(nms) throw(ArgumentError("Number of returned columns is $(length(nms)) " * diff --git a/test/grouping.jl b/test/grouping.jl index d26f09cb22..089f4257ec 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -4312,4 +4312,69 @@ end @test_throws ArgumentError gdf[Not([true true true true])] end +@testset "aggregation of empty GroupedDataFrame with table output" begin + df = DataFrame(:a => Int[]) + gdf = groupby(df, :a) + @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y="a")]) => AsTable, :a => :b), + DataFrame(a=Int[], x=Int[], y=String[], b=Int[])) + @test isequal_typed(combine(gdf, :a => (x -> [(1, "a")]) => AsTable, :a => :b), + DataFrame(a=Int[], x1=Int[], x2=String[], b=Int[])) + @test isequal_typed(combine(gdf, :a => (x -> ["ab"]) => AsTable, :a => :b), + DataFrame(a=Int[], x1=Char[], x2=Char[], b=Int[])) + # test below errors because keys for strings do not support == comparison + @test_throws ArgumentError combine(gdf, :a => (x -> ["ab", "cd"]) => AsTable, :a => :b) + @test isequal_typed(combine(gdf, :a => (x -> []) => AsTable, :a => :b), + DataFrame(a=Int[], b=Int[])) + @test_throws ArgumentError combine(gdf, :a => (x -> [(a=x, b=x), (a=x, c=x)]) => AsTable) + @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y=2), (x=3, y="a")]) => AsTable), + DataFrame(a=Int[], x=Int[], y=Any[])) + @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => AsTable), + DataFrame(a=Int[], x=Vector{Int}[], y=Any[])) + @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]), + DataFrame(a=Int[], z1=Vector{Int}[], z2=Any[])) + @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3]) + + df = DataFrame(:a => [1, 2]) + gdf = groupby(df, :a)[2:1] + @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y="a")]) => AsTable, :a => :b), + DataFrame(a=Int[], x=Int[], y=String[], b=Int[])) + @test isequal_typed(combine(gdf, :a => (x -> [(1, "a")]) => AsTable, :a => :b), + DataFrame(a=Int[], x1=Int[], x2=String[], b=Int[])) + @test isequal_typed(combine(gdf, :a => (x -> ["ab"]) => AsTable, :a => :b), + DataFrame(a=Int[], x1=Char[], x2=Char[], b=Int[])) + # test below errors because keys for strings do not support == comparison + @test_throws ArgumentError combine(gdf, :a => (x -> ["ab", "cd"]) => AsTable, :a => :b) + @test isequal_typed(combine(gdf, :a => (x -> []) => AsTable, :a => :b), + DataFrame(a=Int[], b=Int[])) + @test_throws ArgumentError combine(gdf, :a => (x -> [(a=x, b=x), (a=x, c=x)]) => AsTable) + @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y=2), (x=3, y="a")]) => AsTable), + DataFrame(a=Int[], x=Int[], y=Any[])) + @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => AsTable), + DataFrame(a=Int[], x=Vector{Int}[], y=Any[])) + @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]), + DataFrame(a=Int[], z1=Vector{Int}[], z2=Any[])) + @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3]) + + df = DataFrame(:a => [1, 2]) + gdf = groupby(df, :a) + @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y="a")]) => AsTable, :a => :b), + DataFrame(a=1:2, x=[1, 1], y=["a", "a"], b=1:2)) + @test isequal_typed(combine(gdf, :a => (x -> [(1, "a")]) => AsTable, :a => :b), + DataFrame(a=1:2, x1=[1, 1], x2=["a", "a"], b=1:2)) + @test isequal_typed(combine(gdf, :a => (x -> ["ab"]) => AsTable, :a => :b), + DataFrame(a=1:2, x1=['a', 'a'], x2=['b', 'b'], b=1:2)) + # test below errors because keys for strings do not support == comparison + @test_throws ArgumentError combine(gdf, :a => (x -> ["ab", "cd"]) => AsTable, :a => :b) + @test isequal_typed(combine(gdf, :a => (x -> []) => AsTable, :a => :b), + DataFrame(a=1:2, b=1:2)) + @test_throws ArgumentError combine(gdf, :a => (x -> [(a=x, b=x), (a=x, c=x)]) => AsTable) + @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y=2), (x=3, y="a")]) => AsTable), + DataFrame(a=[1, 1, 2, 2], x=[1, 3, 1, 3], y=Any[2, "a", 2, "a"])) + @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => AsTable), + DataFrame(a=[1, 1, 2, 2], x=[[1], [3], [1], [3]], y=Any[2, "a", 2, "a"])) + @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]), + DataFrame(a=[1, 1, 2, 2], z1=[[1], [3], [1], [3]], z2=Any[2, "a", 2, "a"])) + @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3]) +end + end # module From f382932380debdcabeba79f613b88ffba0300e48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 9 Nov 2022 18:12:50 +0100 Subject: [PATCH 2/5] improve test coverage --- test/grouping.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/grouping.jl b/test/grouping.jl index 089f4257ec..670179ff87 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -4375,6 +4375,7 @@ end @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]), DataFrame(a=[1, 1, 2, 2], z1=[[1], [3], [1], [3]], z2=Any[2, "a", 2, "a"])) @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3]) + @test_throws ArgumentError combine(gdf, :a => (x -> [Dict('x' => 1)]) => AsTable) end end # module From 78b386f557f9e1414234da0be3fc9474497fbd70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 9 Nov 2022 21:27:42 +0100 Subject: [PATCH 3/5] improve documentation --- docs/src/man/split_apply_combine.md | 15 +++++++++++---- src/abstractdataframe/selection.jl | 15 ++++++++++----- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index fc1d60fec9..e1e81c296a 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -30,15 +30,22 @@ object from your data frame using the `groupby` function that takes two argument (1) a data frame to be grouped, and (2) a set of columns to group by. Operations can then be applied on each group using one of the following functions: -* `combine`: does not put restrictions on number of rows returned, the order of rows - is specified by the order of groups in `GroupedDataFrame`; it is typically used - to compute summary statistics by group; +* `combine`: does not put restrictions on number of rows returned per group; + the returned values are vertically concatenaded following order of groups in + `GroupedDataFrame`; it is typically used to compute summary statistics by group; + for `GroupedDataFrame` if grouping columns are kept they are put as first columns + in the result; * `select`: return a data frame with the number and order of rows exactly the same as the source data frame, including only new calculated columns; `select!` is an in-place version of `select`; * `transform`: return a data frame with the number and order of rows exactly the same as the source data frame, including all columns from the source and new calculated columns; - `transform!` is an in-place version of `transform`. + `transform!` is an in-place version of `transform`; + existing columns in the source data frame are put as first columns in the result; + +As a special case, if a `GroupedDataFrame` that has zero groups is passed then +the result of the operation is determined by performing a single call to the +transformation function with a 0-row argument passed to it. All these functions take a specification of one or more functions to apply to each subset of the `DataFrame`. This specification can be of the following forms: diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 0d1363fe89..56745c8b97 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -37,19 +37,24 @@ const TRANSFORMATION_COMMON_RULES = (1) a data frame to be grouped, and (2) a set of columns to group by. Operations can then be applied on each group using one of the following functions: - * `combine`: does not put restrictions on number of rows returned, the order of rows - is specified by the order of groups in `GroupedDataFrame`; it is typically used - to compute summary statistics by group; for `GroupedDataFrame` if grouping columns - are kept they are put as first columns in the result; + * `combine`: does not put restrictions on number of rows returned per group; + the returned values are vertically concatenaded following order of groups in + `GroupedDataFrame`; it is typically used to compute summary statistics by group; + for `GroupedDataFrame` if grouping columns are kept they are put as first columns + in the result; * `select`: return a data frame with the number and order of rows exactly the same as the source data frame, including only new calculated columns; `select!` is an in-place version of `select`; for `GroupedDataFrame` if grouping columns are kept they are put as first columns in the result; * `transform`: return a data frame with the number and order of rows exactly the same as the source data frame, including all columns from the source and new calculated columns; - `transform!` is an in-place version of `transform`; for `GroupedDataFrame` + `transform!` is an in-place version of `transform`; existing columns in the source data frame are put as first columns in the result; + As a special case, if a `GroupedDataFrame` that has zero groups is passed then + the result of the operation is determined by performing a single call to the + transformation function with a 0-row argument passed to it. + All these functions take a specification of one or more functions to apply to each subset of the `DataFrame`. This specification can be of the following forms: 1. standard column selectors (integers, `Symbol`s, strings, vectors of integers, From c567e10aaf4993c2336a77d646974c07e7e5a710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 12 Nov 2022 14:15:40 +0100 Subject: [PATCH 4/5] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/groupeddataframe/splitapplycombine.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 161ec7bb25..b627b7ba5d 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -493,7 +493,7 @@ end "`Symbol`s, strings or integers")) end if any(x -> !isequal(keys(x), kp1), res) - throw(ArgumentError("keys of the returned elements must be identical")) + throw(ArgumentError("keys of the returned elements must be equal")) end outcols = [[x[n] for x in res] for n in kp1] # make sure we only infer column names and types for empty res, but do not @@ -529,7 +529,7 @@ function _combine_process_pair_astable(optional_i::Bool, else emptyres = false end - kp1 = isempty(res) ? () : kp1 = keys(res[1]) + kp1 = isempty(res) ? () : keys(res[1]) outcols, nms = expand_res_astable(res, kp1, emptyres) else From a6ba2da48ad51ea3384089b8a87bec7207aa8573 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 12 Nov 2022 14:21:43 +0100 Subject: [PATCH 5/5] improve docstring --- docs/src/man/split_apply_combine.md | 4 +++- src/abstractdataframe/selection.jl | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index e1e81c296a..5e34a2e175 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -45,7 +45,9 @@ Operations can then be applied on each group using one of the following function As a special case, if a `GroupedDataFrame` that has zero groups is passed then the result of the operation is determined by performing a single call to the -transformation function with a 0-row argument passed to it. +transformation function with a 0-row argument passed to it. The output of this +operation is only used to identify the number and type of produced columns, but +the result has zero rows. All these functions take a specification of one or more functions to apply to each subset of the `DataFrame`. This specification can be of the following forms: diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 56745c8b97..9e32989c68 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -53,7 +53,9 @@ const TRANSFORMATION_COMMON_RULES = As a special case, if a `GroupedDataFrame` that has zero groups is passed then the result of the operation is determined by performing a single call to the - transformation function with a 0-row argument passed to it. + transformation function with a 0-row argument passed to it. The output of this + operation is only used to identify the number and type of produced columns, but + the result has zero rows. All these functions take a specification of one or more functions to apply to each subset of the `DataFrame`. This specification can be of the following forms: