Skip to content

Commit

Permalink
make aggregation of empty GroupedDataFrame correct with AsTable
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins committed Nov 9, 2022
1 parent a1dc899 commit 25adf7b
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 16 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# DataFrames.jl v1.4.3 Patch Release Notes

## Bug fixes

* Correctly handle `GroupedDataFrame` with no groups in multi-column operation specification syntax
([#3122](https://github.com/JuliaData/DataFrames.jl/issues/3122))

## Display improvements

* Improve printing of grouping keys when displaying `GroupedDataFrame`
Expand Down
1 change: 0 additions & 1 deletion src/groupeddataframe/complextransforms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ function _combine_with_first((first,)::Ref{Any},
@assert first isa Union{NamedTuple, DataFrameRow, AbstractDataFrame}
@assert f isa Base.Callable
@assert incols isa Union{Nothing, AbstractVector, Tuple, NamedTuple}
@assert first isa Union{NamedTuple, DataFrameRow, AbstractDataFrame}
extrude = false

lgd = length(gd)
Expand Down
42 changes: 27 additions & 15 deletions src/groupeddataframe/splitapplycombine.jl
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,23 @@ function _combine_process_pair_symbol(optional_i::Bool,
end
end

# Expand a collection `res` of keyed elements (e.g. `NamedTuple`s or `Tuple`s)
# into one output column per key listed in `kp1`.
#
# Returns a tuple `(columns, colnames)` where `columns[i]` holds the values
# stored under the `i`-th key of `kp1` for every element of `res`, and
# `colnames[i]` is the matching column name: `Symbol(key)` for `Symbol`/string
# keys and `Symbol("x", key)` for integer keys.
#
# Throws `ArgumentError` if the keys in `kp1` are not all integers, all
# `Symbol`s or all strings, or if some element of `res` has keys that are not
# `isequal` to `kp1`.
#
# When `emptyres` is `true` the columns are emptied before being returned, so
# only their names and element types are kept.
@noinline function expand_res_astable(res, kp1, emptyres::Bool)
    integer_keys = all(key -> key isa Integer, kp1)
    name_keys = all(key -> key isa Symbol, kp1) ||
                all(key -> key isa AbstractString, kp1)
    if !integer_keys && !name_keys
        throw(ArgumentError("keys of the returned elements must be " *
                            "`Symbol`s, strings or integers"))
    end

    # every element must expose exactly the same keys as the reference `kp1`
    for elem in res
        isequal(keys(elem), kp1) ||
            throw(ArgumentError("keys of the returned elements must be identical"))
    end

    columns = [[elem[key] for elem in res] for key in kp1]
    if emptyres
        # `res` was only used to infer column names and element types;
        # the values produced while computing it must not end up in the output
        foreach(empty!, columns)
    end

    colnames = [integer_keys ? Symbol("x", key) : Symbol(key) for key in kp1]
    return columns, colnames
end

# perform a transformation specified using the Pair notation with multiple output columns
function _combine_process_pair_astable(optional_i::Bool,
gd::GroupedDataFrame,
Expand All @@ -506,19 +523,15 @@ function _combine_process_pair_astable(optional_i::Bool,
firstmulticol, NOTHING_IDX_AGG, threads)
@assert length(outcol_vec) == 1
res = outcol_vec[1]
@assert length(res) > 0

kp1 = keys(res[1])
prepend = all(x -> x isa Integer, kp1)
if !(prepend || all(x -> x isa Symbol, kp1) || all(x -> x isa AbstractString, kp1))
throw(ArgumentError("keys of the returned elements must be " *
"`Symbol`s, strings or integers"))
end
if any(x -> !isequal(keys(x), kp1), res)
throw(ArgumentError("keys of the returned elements must be identical"))
# If no elements were produced, fall back to `firstres` so that column names
# and element types can still be inferred; `emptyres` tells
# `expand_res_astable` to discard the values taken from `firstres` afterwards.
emptyres = isempty(res)
if emptyres
    res = firstres
end
# `res` may still be empty at this point (when `firstres` itself is empty),
# in which case there are no keys to expand
kp1 = isempty(res) ? () : keys(res[1])

outcols, nms = expand_res_astable(res, kp1, emptyres)
else
if !firstmulticol
firstres = Tables.columntable(firstres)
Expand All @@ -527,9 +540,8 @@ function _combine_process_pair_astable(optional_i::Bool,
end
idx, outcols, nms = _combine_multicol(Ref{Any}(firstres), Ref{Any}(fun), gd,
wincols, threads)

if !(firstres isa Union{AbstractVecOrMat, AbstractDataFrame,
NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}})
NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}})
lock(gd.lazy_lock) do
# if idx_agg was not computed yet it is nothing
# in this case if we are not passed a vector compute it.
Expand All @@ -541,8 +553,8 @@ function _combine_process_pair_astable(optional_i::Bool,
idx = idx_agg[]
end
end
@assert length(outcols) == length(nms)
end
@assert length(outcols) == length(nms)
if out_col_name isa AbstractVector{Symbol}
if length(out_col_name) != length(nms)
throw(ArgumentError("Number of returned columns is $(length(nms)) " *
Expand Down
65 changes: 65 additions & 0 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4312,4 +4312,69 @@ end
@test_throws ArgumentError gdf[Not([true true true true])]
end

@testset "aggregation of empty GroupedDataFrame with table output" begin
    # transformations reused by every scenario below
    one_namedtuple = x -> [(x=1, y="a")]
    one_tuple = x -> [(1, "a")]
    one_string = x -> ["ab"]
    two_strings = x -> ["ab", "cd"]
    no_elements = x -> []
    mismatched_keys = x -> [(a=x, b=x), (a=x, c=x)]
    mixed_values = x -> [(x=1, y=2), (x=3, y="a")]
    nested_values = x -> [(x=[1], y=2), (x=[3], y="a")]

    # two GroupedDataFrames without any group: one built from an empty data
    # frame and one obtained by taking an empty selection of existing groups
    zero_group_gdfs = (groupby(DataFrame(:a => Int[]), :a),
                       groupby(DataFrame(:a => [1, 2]), :a)[2:1])
    for gdf in zero_group_gdfs
        @test isequal_typed(combine(gdf, :a => one_namedtuple => AsTable, :a => :b),
                            DataFrame(a=Int[], x=Int[], y=String[], b=Int[]))
        @test isequal_typed(combine(gdf, :a => one_tuple => AsTable, :a => :b),
                            DataFrame(a=Int[], x1=Int[], x2=String[], b=Int[]))
        @test isequal_typed(combine(gdf, :a => one_string => AsTable, :a => :b),
                            DataFrame(a=Int[], x1=Char[], x2=Char[], b=Int[]))
        # test below errors because keys for strings do not support == comparison
        @test_throws ArgumentError combine(gdf, :a => two_strings => AsTable, :a => :b)
        @test isequal_typed(combine(gdf, :a => no_elements => AsTable, :a => :b),
                            DataFrame(a=Int[], b=Int[]))
        @test_throws ArgumentError combine(gdf, :a => mismatched_keys => AsTable)
        @test isequal_typed(combine(gdf, :a => mixed_values => AsTable),
                            DataFrame(a=Int[], x=Int[], y=Any[]))
        @test isequal_typed(combine(gdf, :a => nested_values => AsTable),
                            DataFrame(a=Int[], x=Vector{Int}[], y=Any[]))
        @test isequal_typed(combine(gdf, :a => nested_values => [:z1, :z2]),
                            DataFrame(a=Int[], z1=Vector{Int}[], z2=Any[]))
        @test_throws ArgumentError combine(gdf, :a => nested_values => [:z1, :z2, :z3])
    end

    # reference scenario: the same operations on a GroupedDataFrame with two groups
    gdf = groupby(DataFrame(:a => [1, 2]), :a)
    @test isequal_typed(combine(gdf, :a => one_namedtuple => AsTable, :a => :b),
                        DataFrame(a=1:2, x=[1, 1], y=["a", "a"], b=1:2))
    @test isequal_typed(combine(gdf, :a => one_tuple => AsTable, :a => :b),
                        DataFrame(a=1:2, x1=[1, 1], x2=["a", "a"], b=1:2))
    @test isequal_typed(combine(gdf, :a => one_string => AsTable, :a => :b),
                        DataFrame(a=1:2, x1=['a', 'a'], x2=['b', 'b'], b=1:2))
    # test below errors because keys for strings do not support == comparison
    @test_throws ArgumentError combine(gdf, :a => two_strings => AsTable, :a => :b)
    @test isequal_typed(combine(gdf, :a => no_elements => AsTable, :a => :b),
                        DataFrame(a=1:2, b=1:2))
    @test_throws ArgumentError combine(gdf, :a => mismatched_keys => AsTable)
    @test isequal_typed(combine(gdf, :a => mixed_values => AsTable),
                        DataFrame(a=[1, 1, 2, 2], x=[1, 3, 1, 3], y=Any[2, "a", 2, "a"]))
    @test isequal_typed(combine(gdf, :a => nested_values => AsTable),
                        DataFrame(a=[1, 1, 2, 2], x=[[1], [3], [1], [3]], y=Any[2, "a", 2, "a"]))
    @test isequal_typed(combine(gdf, :a => nested_values => [:z1, :z2]),
                        DataFrame(a=[1, 1, 2, 2], z1=[[1], [3], [1], [3]], z2=Any[2, "a", 2, "a"]))
    @test_throws ArgumentError combine(gdf, :a => nested_values => [:z1, :z2, :z3])
end

end # module

0 comments on commit 25adf7b

Please sign in to comment.