Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow containers with no columns in combine #2066

Merged
merged 9 commits into from
Jan 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 61 additions & 25 deletions src/groupeddataframe/splitapplycombine.jl
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ which is determined using the following rules:
for each group as the length of the returned vector for that group.
- A data frame, a named tuple of vectors or a matrix gives the same additional columns
and as many rows for each group as the rows returned for that group.
As a special case, returning an empty table with zero columns is allowed,
whatever the number of columns returned for other groups.

`f` must always return the same kind of object (as defined in the above list) for
all groups, and if a named tuple or data frame, with the same fields or columns.
Expand Down Expand Up @@ -340,6 +342,8 @@ which is determined using the following rules:
for each group as the length of the returned vector for that group.
- A data frame, a named tuple of vectors or a matrix gives a `DataFrame` with the same
additional columns and as many rows for each group as the rows returned for that group.
As a special case, returning an empty table with zero columns is allowed,
whatever the number of columns returned for other groups.

`f` must always return the same kind of object (as defined in the above list) for
all groups, and if a named tuple or data frame, with the same fields or columns.
Expand Down Expand Up @@ -798,7 +802,7 @@ function _combine(f::Any, gd::GroupedDataFrame)
!isa(firstres, Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}))
nms = [Symbol(names(gd.parent)[index(gd.parent)[first(f)]], '_', funname(fun))]
end
valscat = DataFrame(collect(AbstractVector, outcols), collect(Symbol, nms))
valscat = DataFrame(collect(AbstractVector, outcols), nms)
return idx, valscat
end

Expand Down Expand Up @@ -826,9 +830,16 @@ function _combine_with_first(first::Union{NamedTuple, DataFrameRow, AbstractData
let eltys=eltys, n=n # Workaround for julia#15276
initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], n), _ncol(first))
end
outcols = _combine_with_first!(first, initialcols, idx, 1, 1, f, gd, incols,
tuple(propertynames(first)...))
idx, outcols, propertynames(first)
targetcolnames = tuple(propertynames(first)...)
if first isa Union{AbstractDataFrame,
NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}
outcols, finalcolnames = _combine_tables_with_first!(first, initialcols, idx, 1, 1,
f, gd, incols, targetcolnames)
else
outcols, finalcolnames = _combine_rows_with_first!(first, initialcols, idx, 1, 1,
f, gd, incols, targetcolnames)
end
idx, outcols, collect(Symbol, finalcolnames)
end

function fill_row!(row, outcols::NTuple{N, AbstractVector},
Expand Down Expand Up @@ -864,12 +875,12 @@ function fill_row!(row, outcols::NTuple{N, AbstractVector},
return nothing
end

function _combine_with_first!(first::Union{NamedTuple, DataFrameRow},
outcols::NTuple{N, AbstractVector},
idx::Vector{Int}, rowstart::Integer, colstart::Integer,
f::Any, gd::GroupedDataFrame,
incols::Union{Nothing, AbstractVector, NamedTuple},
colnames::NTuple{N, Symbol}) where N
function _combine_rows_with_first!(first::Union{NamedTuple, DataFrameRow},
outcols::NTuple{N, AbstractVector},
idx::Vector{Int}, rowstart::Integer, colstart::Integer,
f::Any, gd::GroupedDataFrame,
incols::Union{Nothing, AbstractVector, NamedTuple},
colnames::NTuple{N, Symbol}) where N
len = length(gd)
gdidx = gd.idx
starts = gd.starts
Expand Down Expand Up @@ -897,11 +908,12 @@ function _combine_with_first!(first::Union{NamedTuple, DataFrameRow},
end
end
end
return _combine_with_first!(row, newcols, idx, i, j, f, gd, incols, colnames)
return _combine_rows_with_first!(row, newcols, idx, i, j,
f, gd, incols, colnames)
end
idx[i] = gdidx[starts[i]]
end
outcols
return outcols, colnames
end

# This needs to be in a separate function
Expand Down Expand Up @@ -935,7 +947,7 @@ function append_rows!(rows, outcols::NTuple{N, AbstractVector},
vals = getproperty(rows, cn)
catch
throw(ArgumentError("return value must have the same column names " *
"for all groups (got $(Tuple(colnames)) and $(Tuple(names(rows))))"))
"for all groups (got $colnames and $(propertynames(rows)))"))
end
S = eltype(vals)
T = eltype(col)
Expand All @@ -946,24 +958,45 @@ function append_rows!(rows, outcols::NTuple{N, AbstractVector},
return nothing
end

function _combine_with_first!(first::Union{AbstractDataFrame,
NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}},
outcols::NTuple{N, AbstractVector},
idx::Vector{Int}, rowstart::Integer, colstart::Integer,
f::Any, gd::GroupedDataFrame,
incols::Union{Nothing, AbstractVector, NamedTuple},
colnames::NTuple{N, Symbol}) where N
function _combine_tables_with_first!(first::Union{AbstractDataFrame,
NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}},
outcols::NTuple{N, AbstractVector},
idx::Vector{Int}, rowstart::Integer, colstart::Integer,
f::Any, gd::GroupedDataFrame,
incols::Union{Nothing, AbstractVector, NamedTuple},
colnames::NTuple{N, Symbol}) where N
len = length(gd)
gdidx = gd.idx
starts = gd.starts
ends = gd.ends
# Handle first group
j = append_rows!(first, outcols, colstart, colnames)
@assert j === nothing # eltype is guaranteed to match
append!(idx, Iterators.repeated(gdidx[starts[rowstart]], _nrow(first)))

@assert _ncol(first) == N
if !isempty(colnames)
j = append_rows!(first, outcols, colstart, colnames)
@assert j === nothing # eltype is guaranteed to match
append!(idx, Iterators.repeated(gdidx[starts[rowstart]], _nrow(first)))
end
# Handle remaining groups
@inbounds for i in rowstart+1:len
rows = wrap(do_call(f, gdidx, starts, ends, gd, incols, i))
if !(rows isa Union{AbstractDataFrame,
NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}})
throw(ArgumentError("return value must not change its kind " *
"(single row or variable number of rows) across groups"))
end
_ncol(rows) == 0 && continue
if isempty(colnames)
newcolnames = tuple(propertynames(rows)...)
if rows isa AbstractDataFrame
eltys = eltype.(eachcol(rows))
else
eltys = map(eltype, rows)
end
initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], 0), _ncol(rows))
return _combine_tables_with_first!(rows, initialcols, idx, i, 1,
f, gd, incols, newcolnames)
end
j = append_rows!(rows, outcols, 1, colnames)
if j !== nothing # Need to widen column type
local newcols
Expand All @@ -979,11 +1012,12 @@ function _combine_with_first!(first::Union{AbstractDataFrame,
end
end
end
return _combine_with_first!(rows, newcols, idx, i, j, f, gd, incols, colnames)
return _combine_tables_with_first!(rows, newcols, idx, i, j,
f, gd, incols, colnames)
end
append!(idx, Iterators.repeated(gdidx[starts[i]], _nrow(rows)))
end
outcols
return outcols, colnames
end

"""
Expand Down Expand Up @@ -1029,6 +1063,8 @@ which is determined using the following rules:
for each group as the length of the returned vector for that group.
- A data frame, a named tuple of vectors or a matrix gives a `DataFrame` with the same
additional columns and as many rows for each group as the rows returned for that group.
As a special case, returning an empty table with zero columns is allowed,
whatever the number of columns returned for other groups.

`f` must always return the same kind of object (as defined in the above list) for
all groups, and if a named tuple or data frame, with the same fields or columns.
Expand Down
48 changes: 41 additions & 7 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -421,8 +421,6 @@ end
@test_throws ArgumentError by(d -> d.x == [1] ? (x1=1) : NamedTuple(), df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? 1 : DataFrame(x1=1), df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? DataFrame(x1=1) : 1, df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? DataFrame() : DataFrame(x1=1), df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? DataFrame(x1=1) : DataFrame(), df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? (x1=1) : (x1=[1]), df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? (x1=[1]) : (x1=1), df, :x)
@test_throws ArgumentError by(d -> d.x == [1] ? 1 : [1], df, :x)
Expand Down Expand Up @@ -643,29 +641,29 @@ end
by(df, :a, (:c => sum,)) ==
by(df, :a, [:c => sum]) ==
by(df, :a, c_sum = :c => sum) ==
by(d -> (c_sum=sum(d.c),), df, :a)
by(d -> (c_sum=sum(d.c),), df, :a) ==
bkamins marked this conversation as resolved.
Show resolved Hide resolved
by(df, :a, d -> (c_sum=sum(d.c),))

@test by(:c => vexp, df, :a) ==
by(df, :a, :c => vexp) ==
by(df, :a, (:c => vexp,)) ==
by(df, :a, [:c => vexp]) ==
by(df, :a, c_function = :c => vexp) ==
by(d -> (c_function=vexp(d.c),), df, :a)
by(d -> (c_function=vexp(d.c),), df, :a) ==
by(df, :a, d -> (c_function=vexp(d.c),))

@test by(df, :a, :b => sum, :c => sum) ==
by(df, :a, (:b => sum, :c => sum,)) ==
by(df, :a, [:b => sum, :c => sum]) ==
by(df, :a, b_sum = :b => sum, c_sum = :c => sum) ==
by(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), df, :a)
by(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), df, :a) ==
by(df, :a, d -> (b_sum=sum(d.b), c_sum=sum(d.c)))

@test by(df, :a, :b => vexp, :c => identity) ==
by(df, :a, (:b => vexp, :c => identity,)) ==
by(df, :a, [:b => vexp, :c => identity]) ==
by(df, :a, b_function = :b => vexp, c_identity = :c => identity) ==
by(d -> (b_function=vexp(d.b), c_identity=identity(d.c)), df, :a)
by(d -> (b_function=vexp(d.b), c_identity=identity(d.c)), df, :a) ==
by(df, :a, d -> (b_function=vexp(d.b), c_identity=identity(d.c)))

gd = groupby(df, :a)
Expand All @@ -689,7 +687,7 @@ end
combine(gd, c_function = :c => vexp) ==
combine(:c => x -> (c_function=exp.(x),), gd) ==
combine(gd, :c => x -> (c_function=exp.(x),)) ==
combine(d -> (c_function=exp.(d.c),), gd)
combine(d -> (c_function=exp.(d.c),), gd) ==
combine(gd, d -> (c_function=exp.(d.c),))

@test combine(gd, :b => sum, :c => sum) ==
Expand Down Expand Up @@ -1480,4 +1478,40 @@ end
@test by(sdf -> sdf[1, [3,2,1]], df, :a) == df[1:2:5, [1,3,2]]
end

@testset "Allow returning DataFrame() or NamedTuple() to drop group" begin
N = 4
for (i, x1) in enumerate(collect.(Iterators.product(repeat([[true, false]], N)...))),
er in (DataFrame(), view(DataFrame(ones(2,2)), 2:1, 2:1),
view(DataFrame(ones(2,2)), 1:2, 2:1),
NamedTuple(), rand(0,0), rand(5,0),
DataFrame(x1=Int[]), DataFrame(x1=Any[]),
(x1=Int[],), (x1=Any[],), rand(0,1)),
fr in (DataFrame(x1=[true]), (x1=[true],), hcat(true), [true])
df = DataFrame(a = 1:N, x1 = x1)
res = by(sdf -> sdf.x1[1] ? fr : er, df, :a)
@test res == DataFrame(map(sdf -> sdf.x1[1] ? fr : er, groupby_checked(df, :a)))
if fr == [true]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we have a column naming issue here. It is unrelated to this PR:

julia> by(:a => x -> x[1] != 1 ? (x1=[2],) : [1], DataFrame(a=1:2), :a)
2×2 DataFrame
│ Row │ a     │ a_function │
│     │ Int64 │ Int64      │
├─────┼───────┼────────────┤
│ 1   │ 1     │ 1          │
│ 2   │ 2     │ 2          │

julia> by(:a => x -> x[1] == 1 ? (x1=[1],) : [2], DataFrame(a=1:2), :a)
2×2 DataFrame
│ Row │ a     │ x1    │
│     │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1   │ 1     │ 1     │
│ 2   │ 2     │ 2     │

What do you think we should do in such cases (now we inherit column name from the first row in a different way than we check when combining for name consistency).

The problematic line is https://github.com/JuliaData/DataFrames.jl/pull/2066/files#diff-23657e51a9cc9e627fc153ba1e6e04c1L799, as it "manually" overrides column naming mechanics using in combine code.

As you can see it can lead to significant inconsistency (column name passed in NamedTuple got silently overriden by a_function).

An alternative solution would be to disallow mixing vectors and named tuple of a single vector (but current design allows this).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that's not great, but I'm not sure what better solution we could find. Ideally I guess we could find whether at least one named tuple has been returned, and take its names (that would be doable if we pass a Boolean to _combine_to_with_first!). Maybe file an issue about that?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have opened #2071. Actually I think this should be an error (unless the automatic column name for a vector and the actual column name in named tuple match)

if df.x1[1]
@test rename(res, 2=>:x1_function) == by(df, :a, :x1 => x -> x[1] ? fr : er)
else
@test res == by(df, :a, :x1 => x -> x[1] ? fr : er)
end
else
@test res == by(df, :a, :x1 => x -> x[1] ? fr : er)
end
@test res == by(df, :a, (:a, :x1) => x -> x.x1[1] ? fr : er)
if nrow(res) == 0 && length(propertynames(er)) == 0 && er != rand(0, 1)
@test res == DataFrame(a=[])
@test typeof(res.a) == Vector{Int}
else
@test res == df[df.x1, :]
end
if 1 < i < 2^N
@test_throws ArgumentError by(sdf -> sdf.x1[1] ? (x1=true,) : er, df, :a)
@test_throws ArgumentError by(sdf -> sdf.x1[1] ? fr : (x2=[true],), df, :a)
@test_throws ArgumentError by(sdf -> sdf.x1[1] ? true : er, df, :a)
end
end
end

end # module