Skip to content

Commit

Permalink
Allow tables with no columns in combine to drop groups
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Jan 2, 2020
1 parent 9c8de96 commit f9a0f7c
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 32 deletions.
86 changes: 61 additions & 25 deletions src/groupeddataframe/splitapplycombine.jl
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ which is determined using the following rules:
for each group as the length of the returned vector for that group.
- A data frame, a named tuple of vectors or a matrix gives the same additional columns
and as many rows for each group as the rows returned for that group.
As a special case, returning an empty table with zero columns is allowed,
whatever the number of columns returned for other groups.
`f` must always return the same kind of object (as defined in the above list) for
all groups, and if a named tuple or data frame, with the same fields or columns.
Expand Down Expand Up @@ -340,6 +342,8 @@ which is determined using the following rules:
for each group as the length of the returned vector for that group.
- A data frame, a named tuple of vectors or a matrix gives a `DataFrame` with the same
additional columns and as many rows for each group as the rows returned for that group.
As a special case, returning an empty table with zero columns is allowed,
whatever the number of columns returned for other groups.
`f` must always return the same kind of object (as defined in the above list) for
all groups, and if a named tuple or data frame, with the same fields or columns.
Expand Down Expand Up @@ -798,7 +802,7 @@ function _combine(f::Any, gd::GroupedDataFrame)
!isa(firstres, Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}))
nms = [Symbol(names(gd.parent)[index(gd.parent)[first(f)]], '_', funname(fun))]
end
valscat = DataFrame(collect(AbstractVector, outcols), collect(Symbol, nms))
valscat = DataFrame(collect(AbstractVector, outcols), nms)
return idx, valscat
end

Expand Down Expand Up @@ -826,9 +830,16 @@ function _combine_with_first(first::Union{NamedTuple, DataFrameRow, AbstractData
let eltys=eltys, n=n # Workaround for julia#15276
initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], n), _ncol(first))
end
outcols = _combine_with_first!(first, initialcols, idx, 1, 1, f, gd, incols,
tuple(propertynames(first)...))
idx, outcols, propertynames(first)
targetcolnames = tuple(propertynames(first)...)
if first isa Union{AbstractDataFrame,
NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}
outcols, finalcolnames = _combine_tables_with_first!(first, initialcols, idx, 1, 1,
f, gd, incols, targetcolnames)
else
outcols, finalcolnames = _combine_rows_with_first!(first, initialcols, idx, 1, 1,
f, gd, incols, targetcolnames)
end
idx, outcols, collect(Symbol, finalcolnames)
end

function fill_row!(row, outcols::NTuple{N, AbstractVector},
Expand Down Expand Up @@ -864,12 +875,12 @@ function fill_row!(row, outcols::NTuple{N, AbstractVector},
return nothing
end

function _combine_with_first!(first::Union{NamedTuple, DataFrameRow},
outcols::NTuple{N, AbstractVector},
idx::Vector{Int}, rowstart::Integer, colstart::Integer,
f::Any, gd::GroupedDataFrame,
incols::Union{Nothing, AbstractVector, NamedTuple},
colnames::NTuple{N, Symbol}) where N
function _combine_rows_with_first!(first::Union{NamedTuple, DataFrameRow},
outcols::NTuple{N, AbstractVector},
idx::Vector{Int}, rowstart::Integer, colstart::Integer,
f::Any, gd::GroupedDataFrame,
incols::Union{Nothing, AbstractVector, NamedTuple},
colnames::NTuple{N, Symbol}) where N
len = length(gd)
gdidx = gd.idx
starts = gd.starts
Expand Down Expand Up @@ -897,11 +908,12 @@ function _combine_with_first!(first::Union{NamedTuple, DataFrameRow},
end
end
end
return _combine_with_first!(row, newcols, idx, i, j, f, gd, incols, colnames)
return _combine_rows_with_first!(row, newcols, idx, i, j,
f, gd, incols, colnames)
end
idx[i] = gdidx[starts[i]]
end
outcols
return outcols, colnames
end

# This needs to be in a separate function
Expand Down Expand Up @@ -935,7 +947,7 @@ function append_rows!(rows, outcols::NTuple{N, AbstractVector},
vals = getproperty(rows, cn)
catch
throw(ArgumentError("return value must have the same column names " *
"for all groups (got $(Tuple(colnames)) and $(Tuple(names(rows))))"))
"for all groups (got $colnames and $(propertynames(rows)))"))
end
S = eltype(vals)
T = eltype(col)
Expand All @@ -946,24 +958,45 @@ function append_rows!(rows, outcols::NTuple{N, AbstractVector},
return nothing
end

function _combine_with_first!(first::Union{AbstractDataFrame,
NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}},
outcols::NTuple{N, AbstractVector},
idx::Vector{Int}, rowstart::Integer, colstart::Integer,
f::Any, gd::GroupedDataFrame,
incols::Union{Nothing, AbstractVector, NamedTuple},
colnames::NTuple{N, Symbol}) where N
function _combine_tables_with_first!(first::Union{AbstractDataFrame,
NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}},
outcols::NTuple{N, AbstractVector},
idx::Vector{Int}, rowstart::Integer, colstart::Integer,
f::Any, gd::GroupedDataFrame,
incols::Union{Nothing, AbstractVector, NamedTuple},
colnames::NTuple{N, Symbol}) where N
len = length(gd)
gdidx = gd.idx
starts = gd.starts
ends = gd.ends
# Handle first group
j = append_rows!(first, outcols, colstart, colnames)
@assert j === nothing # eltype is guaranteed to match
append!(idx, Iterators.repeated(gdidx[starts[rowstart]], _nrow(first)))

@assert _ncol(first) == N
if !isempty(colnames)
j = append_rows!(first, outcols, colstart, colnames)
@assert j === nothing # eltype is guaranteed to match
append!(idx, Iterators.repeated(gdidx[starts[rowstart]], _nrow(first)))
end
# Handle remaining groups
@inbounds for i in rowstart+1:len
rows = wrap(do_call(f, gdidx, starts, ends, gd, incols, i))
if !(rows isa Union{AbstractDataFrame,
NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}})
throw(ArgumentError("return value must not change its kind " *
"(single row or variable number of rows) across groups"))
end
_ncol(rows) == 0 && continue
if isempty(colnames)
newcolnames = tuple(propertynames(rows)...)
if rows isa AbstractDataFrame
eltys = eltype.(eachcol(rows))
else
eltys = map(eltype, rows)
end
initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], 0), _ncol(rows))
return _combine_tables_with_first!(rows, initialcols, idx, i, 1,
f, gd, incols, newcolnames)
end
j = append_rows!(rows, outcols, 1, colnames)
if j !== nothing # Need to widen column type
local newcols
Expand All @@ -979,11 +1012,12 @@ function _combine_with_first!(first::Union{AbstractDataFrame,
end
end
end
return _combine_with_first!(rows, newcols, idx, i, j, f, gd, incols, colnames)
return _combine_tables_with_first!(rows, newcols, idx, i, j,
f, gd, incols, colnames)
end
append!(idx, Iterators.repeated(gdidx[starts[i]], _nrow(rows)))
end
outcols
return outcols, colnames
end

"""
Expand Down Expand Up @@ -1029,6 +1063,8 @@ which is determined using the following rules:
for each group as the length of the returned vector for that group.
- A data frame, a named tuple of vectors or a matrix gives a `DataFrame` with the same
additional columns and as many rows for each group as the rows returned for that group.
As a special case, returning an empty table with zero columns is allowed,
whatever the number of columns returned for other groups.
`f` must always return the same kind of object (as defined in the above list) for
all groups, and if a named tuple or data frame, with the same fields or columns.
Expand Down
48 changes: 41 additions & 7 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -421,8 +421,6 @@ end
@test_throws ArgumentError by(d -> d.x == [1] ? (x1=1) : NamedTuple(), df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? 1 : DataFrame(x1=1), df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? DataFrame(x1=1) : 1, df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? DataFrame() : DataFrame(x1=1), df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? DataFrame(x1=1) : DataFrame(), df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? (x1=1) : (x1=[1]), df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? (x1=[1]) : (x1=1), df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? 1 : [1], df, :x)
Expand Down Expand Up @@ -643,29 +641,29 @@ end
by(df, :a, (:c => sum,)) ==
by(df, :a, [:c => sum]) ==
by(df, :a, c_sum = :c => sum) ==
by(d -> (c_sum=sum(d.c),), df, :a)
by(d -> (c_sum=sum(d.c),), df, :a) ==
by(df, :a, d -> (c_sum=sum(d.c),))

@test by(:c => vexp, df, :a) ==
by(df, :a, :c => vexp) ==
by(df, :a, (:c => vexp,)) ==
by(df, :a, [:c => vexp]) ==
by(df, :a, c_function = :c => vexp) ==
by(d -> (c_function=vexp(d.c),), df, :a)
by(d -> (c_function=vexp(d.c),), df, :a) ==
by(df, :a, d -> (c_function=vexp(d.c),))

@test by(df, :a, :b => sum, :c => sum) ==
by(df, :a, (:b => sum, :c => sum,)) ==
by(df, :a, [:b => sum, :c => sum]) ==
by(df, :a, b_sum = :b => sum, c_sum = :c => sum) ==
by(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), df, :a)
by(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), df, :a) ==
by(df, :a, d -> (b_sum=sum(d.b), c_sum=sum(d.c)))

@test by(df, :a, :b => vexp, :c => identity) ==
by(df, :a, (:b => vexp, :c => identity,)) ==
by(df, :a, [:b => vexp, :c => identity]) ==
by(df, :a, b_function = :b => vexp, c_identity = :c => identity) ==
by(d -> (b_function=vexp(d.b), c_identity=identity(d.c)), df, :a)
by(d -> (b_function=vexp(d.b), c_identity=identity(d.c)), df, :a) ==
by(df, :a, d -> (b_function=vexp(d.b), c_identity=identity(d.c)))

gd = groupby(df, :a)
Expand All @@ -689,7 +687,7 @@ end
combine(gd, c_function = :c => vexp) ==
combine(:c => x -> (c_function=exp.(x),), gd) ==
combine(gd, :c => x -> (c_function=exp.(x),)) ==
combine(d -> (c_function=exp.(d.c),), gd)
combine(d -> (c_function=exp.(d.c),), gd) ==
combine(gd, d -> (c_function=exp.(d.c),))

@test combine(gd, :b => sum, :c => sum) ==
Expand Down Expand Up @@ -1484,4 +1482,40 @@ end
@test by(sdf -> sdf[1, [3,2,1]], df, :a) == df[1:2:5, [1,3,2]]
end

# Check that a function passed to `by` may return a table with zero columns for
# some groups, in which case those groups contribute no rows to the result.
@testset "Allow returning DataFrame() or NamedTuple() to drop group" begin
# Number of rows in the test data frame; `a = 1:N` below is used as the grouping
# column, so every group holds exactly one row.
N = 4
# The loop runs over the Cartesian product of three iterators:
#   x1 - every length-N vector of true/false values (2^N of them; i == 1 is all
#        true and i == 2^N is all false); it selects which groups return `fr`;
#   er - the "empty" return value: zero-column data frames and views, an empty
#        named tuple, zero-size matrices, and zero-row tables with one `x1` column;
#   fr - the "full" return value: a single `true` entry given as a data frame,
#        a named tuple of vectors, a 1×1 matrix or a vector.
for (i, x1) in enumerate(collect.(Iterators.product(repeat([[true, false]], N)...))),
er in (DataFrame(), view(DataFrame(ones(2,2)), 2:1, 2:1),
view(DataFrame(ones(2,2)), 1:2, 2:1),
NamedTuple(), rand(0,0), rand(5,0),
DataFrame(x1=Int[]), DataFrame(x1=Any[]),
(x1=Int[],), (x1=Any[],), rand(0,1)),
fr in (DataFrame(x1=[true]), (x1=[true],), hcat(true), [true])
df = DataFrame(a = 1:N, x1 = x1)
# Groups whose `x1` value is true return `fr`; all other groups return `er`.
res = by(sdf -> sdf.x1[1] ? fr : er, df, :a)
# NOTE(review): `groupby_checked` is a helper defined elsewhere in the test
# suite; presumably it wraps `groupby` with extra consistency checks - confirm.
@test res == DataFrame(map(sdf -> sdf.x1[1] ? fr : er, groupby_checked(df, :a)))
# Same computation through the `:x1 => function` form.
if fr == [true]
if df.x1[1]
# The first group returns a plain vector here, so the `:x1 => function` form
# presumably auto-names its output column `x1_function`; the second column
# of `res` is renamed to match before comparing.
@test rename(res, 2=>:x1_function) == by(df, :a, :x1 => x -> x[1] ? fr : er)
else
@test res == by(df, :a, :x1 => x -> x[1] ? fr : er)
end
else
@test res == by(df, :a, :x1 => x -> x[1] ? fr : er)
end
# Same computation when several columns are passed (as a named tuple `x`).
@test res == by(df, :a, (:a, :x1) => x -> x.x1[1] ? fr : er)
# When no rows were produced and `er` carries no property names (the 0×1 matrix
# is explicitly excluded), only the grouping column `a` is expected, and it
# must keep its `Int` element type.
if nrow(res) == 0 && length(propertynames(er)) == 0 && er != rand(0, 1)
@test res == DataFrame(a=[])
@test typeof(res.a) == Vector{Int}
else
# Otherwise the result is expected to equal the rows of `df` where `x1` is true.
@test res == df[df.x1, :]
end
# Only for mixed inputs (neither all true nor all false), so that both branches
# of the ternary are actually taken: mixing a single-row named tuple or a
# scalar with `er`, or mixing `fr` with a differently named column (`x2`),
# must throw an `ArgumentError`.
if 1 < i < 2^N
@test_throws ArgumentError by(sdf -> sdf.x1[1] ? (x1=true,) : er, df, :a)
@test_throws ArgumentError by(sdf -> sdf.x1[1] ? fr : (x2=[true],), df, :a)
@test_throws ArgumentError by(sdf -> sdf.x1[1] ? true : er, df, :a)
end
end
end

end # module

0 comments on commit f9a0f7c

Please sign in to comment.