From 98c74800c772eacb4b301544edd97e90f01e7257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 26 Dec 2019 15:54:52 +0100 Subject: [PATCH 1/8] allow containers with no columns in combine --- src/groupeddataframe/splitapplycombine.jl | 51 +++++++++++++++++------ test/grouping.jl | 47 +++++++++++++++++---- 2 files changed, 78 insertions(+), 20 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 72aaa348b6..d03b1cadd3 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -201,6 +201,7 @@ which is determined using the following rules: for each group as the length of the returned vector for that group. - A data frame, a named tuple of vectors or a matrix gives the same additional columns and as many rows for each group as the rows returned for that group. + In this case returning a table with no columns drops group. `f` must always return the same kind of object (as defined in the above list) for all groups, and if a named tuple or data frame, with the same fields or columns. @@ -340,6 +341,7 @@ which is determined using the following rules: for each group as the length of the returned vector for that group. - A data frame, a named tuple of vectors or a matrix gives a `DataFrame` with the same additional columns and as many rows for each group as the rows returned for that group. + In this case returning a table with no columns drops group. `f` must always return the same kind of object (as defined in the above list) for all groups, and if a named tuple or data frame, with the same fields or columns. @@ -790,7 +792,7 @@ function _combine(f::Any, gd::GroupedDataFrame) !isa(firstres, Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix})) nms = [Symbol(names(gd.parent)[index(gd.parent)[first(f)]], '_', funname(fun))] end - valscat = DataFrame(collect(AbstractVector, outcols), collect(Symbol, nms)) + valscat = DataFrame(collect(AbstractVector, outcols), nms) return idx, valscat end @@ -818,14 +820,17 @@ function _combine_with_first(first::Union{NamedTuple, DataFrameRow, AbstractData let eltys=eltys, n=n # Workaround for julia#15276 initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], n), _ncol(first)) end - outcols = _combine_with_first!(first, initialcols, idx, 1, 1, f, gd, incols, - tuple(propertynames(first)...)) - idx, outcols, propertynames(first) + targetcolnames = collect(Symbol, propertynames(first)) + outcols = first isa Union{AbstractDataFrame, + NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}} ? + _combine_with_first!(first, initialcols, idx, 1, 1, f, gd, incols, targetcolnames) : + _combine_with_first_row!(first, initialcols, idx, 1, 1, f, gd, incols, targetcolnames) + idx, outcols, targetcolnames end function fill_row!(row, outcols::NTuple{N, AbstractVector}, i::Integer, colstart::Integer, - colnames::NTuple{N, Symbol}) where N + colnames::Vector{Symbol}) where N if !isa(row, Union{NamedTuple, DataFrameRow}) throw(ArgumentError("return value must not change its kind " * "(single row or variable number of rows) across groups")) @@ -856,12 +861,12 @@ function fill_row!(row, outcols::NTuple{N, AbstractVector}, return nothing end -function _combine_with_first!(first::Union{NamedTuple, DataFrameRow}, +function _combine_with_first_row!(first::Union{NamedTuple, DataFrameRow}, outcols::NTuple{N, AbstractVector}, idx::Vector{Int}, rowstart::Integer, colstart::Integer, f::Any, gd::GroupedDataFrame, incols::Union{Nothing, AbstractVector, NamedTuple}, - colnames::NTuple{N, Symbol}) where N + colnames::Vector{Symbol}) where N len = length(gd) gdidx = gd.idx starts = gd.starts @@ -889,7 +894,7 @@ function _combine_with_first!(first::Union{NamedTuple, DataFrameRow}, end end end - return _combine_with_first!(row, newcols, idx, i, j, f, gd, incols, colnames) + return _combine_with_first_row!(row, newcols, idx, i, j, f, gd, incols, colnames) end idx[i] = gdidx[starts[i]] end @@ -911,7 +916,7 @@ else end function append_rows!(rows, outcols::NTuple{N, AbstractVector}, - colstart::Integer, colnames::NTuple{N, Symbol}) where N + colstart::Integer, colnames::Vector{Symbol}) where N if !isa(rows, Union{AbstractDataFrame, NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}) throw(ArgumentError("return value must not change its kind " * "(single row or variable number of rows) across groups")) @@ -944,18 +949,37 @@ function _combine_with_first!(first::Union{AbstractDataFrame, idx::Vector{Int}, rowstart::Integer, colstart::Integer, f::Any, gd::GroupedDataFrame, incols::Union{Nothing, AbstractVector, NamedTuple}, - colnames::NTuple{N, Symbol}) where N + colnames::Vector{Symbol}) where N len = length(gd) gdidx = gd.idx starts = gd.starts ends = gd.ends # Handle first group - j = append_rows!(first, outcols, colstart, colnames) - @assert j === nothing # eltype is guaranteed to match - append!(idx, Iterators.repeated(gdidx[starts[rowstart]], _nrow(first))) + + @assert isempty(colnames) == (_ncol(first) == 0) == isempty(outcols) + if !isempty(colnames) + j = append_rows!(first, outcols, colstart, colnames) + @assert j === nothing # eltype is guaranteed to match + append!(idx, Iterators.repeated(gdidx[starts[rowstart]], _nrow(first))) + end # Handle remaining groups @inbounds for i in rowstart+1:len rows = wrap(do_call(f, gdidx, starts, ends, gd, incols, i)) + if !(rows isa Union{AbstractDataFrame, NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}) + throw(ArgumentError("return value must not change its kind " * + "(single row or variable number of rows) across groups")) + end + _ncol(rows) == 0 && continue + if isempty(colnames) + append!(colnames, collect(Symbol, propertynames(rows))) + if rows isa AbstractDataFrame + eltys = eltype.(eachcol(rows)) + else + eltys = map(eltype, rows) + end + initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], 0), _ncol(rows)) + return _combine_with_first!(rows, initialcols, idx, i, 1, f, gd, incols, colnames) + end j = append_rows!(rows, outcols, 1, colnames) if j !== nothing # Need to widen column type local newcols @@ -1021,6 +1045,7 @@ which is determined using the following rules: for each group as the length of the returned vector for that group. - A data frame, a named tuple of vectors or a matrix gives a `DataFrame` with the same additional columns and as many rows for each group as the rows returned for that group. + In this case returning a table with no columns drops group. `f` must always return the same kind of object (as defined in the above list) for all groups, and if a named tuple or data frame, with the same fields or columns. diff --git a/test/grouping.jl b/test/grouping.jl index 4e7a597883..6ed6686fae 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -421,8 +421,6 @@ end @test_throws ArgumentError by(d -> d.x == [1] ? (x1=1) : NamedTuple(), df, :x) @test_throws ArgumentError by(d -> d.x == [1] ? 1 : DataFrame(x1=1), df, :x) @test_throws ArgumentError by(d -> d.x == [1] ? DataFrame(x1=1) : 1, df, :x) - @test_throws ArgumentError by(d -> d.x == [1] ? DataFrame() : DataFrame(x1=1), df, :x) - @test_throws ArgumentError by(d -> d.x == [1] ? DataFrame(x1=1) : DataFrame(), df, :x) @test_throws ArgumentError by(d -> d.x == [1] ? (x1=1) : (x1=[1]), df, :x) @test_throws ArgumentError by(d -> d.x == [1] ? (x1=[1]) : (x1=1), df, :x) @test_throws ArgumentError by(d -> d.x == [1] ? 1 : [1], df, :x) @@ -643,7 +641,7 @@ end by(df, :a, (:c => sum,)) == by(df, :a, [:c => sum]) == by(df, :a, c_sum = :c => sum) == - by(d -> (c_sum=sum(d.c),), df, :a) + by(d -> (c_sum=sum(d.c),), df, :a) == by(df, :a, d -> (c_sum=sum(d.c),)) @test by(:c => vexp, df, :a) == @@ -651,21 +649,21 @@ end by(df, :a, (:c => vexp,)) == by(df, :a, [:c => vexp]) == by(df, :a, c_function = :c => vexp) == - by(d -> (c_function=vexp(d.c),), df, :a) + by(d -> (c_function=vexp(d.c),), df, :a) == by(df, :a, d -> (c_function=vexp(d.c),)) @test by(df, :a, :b => sum, :c => sum) == by(df, :a, (:b => sum, :c => sum,)) == by(df, :a, [:b => sum, :c => sum]) == by(df, :a, b_sum = :b => sum, c_sum = :c => sum) == - by(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), df, :a) + by(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), df, :a) == by(df, :a, d -> (b_sum=sum(d.b), c_sum=sum(d.c))) @test by(df, :a, :b => vexp, :c => identity) == by(df, :a, (:b => vexp, :c => identity,)) == by(df, :a, [:b => vexp, :c => identity]) == by(df, :a, b_function = :b => vexp, c_identity = :c => identity) == - by(d -> (b_function=vexp(d.b), c_identity=identity(d.c)), df, :a) + by(d -> (b_function=vexp(d.b), c_identity=identity(d.c)), df, :a) == by(df, :a, d -> (b_function=vexp(d.b), c_identity=identity(d.c))) gd = groupby(df, :a) @@ -689,7 +687,7 @@ end combine(gd, c_function = :c => vexp) == combine(:c => x -> (c_function=exp.(x),), gd) == combine(gd, :c => x -> (c_function=exp.(x),)) == - combine(d -> (c_function=exp.(d.c),), gd) + combine(d -> (c_function=exp.(d.c),), gd) == combine(gd, d -> (c_function=exp.(d.c),)) @test combine(gd, :b => sum, :c => sum) == @@ -1458,4 +1456,39 @@ end @test_throws KeyError gd[(a=:A, b=:X)] end +@testset "Allow returning DataFrame() or NamedTuple() to drop group" begin + for i in typemin(UInt8):typemax(UInt8), er in (DataFrame(), NamedTuple(), rand(0,0), rand(5,0)) + df = DataFrame(a = 1:8, x1 = Bool.(collect(bitstring(UInt8(i))) .- '0')) + res = by(sdf -> sdf.x1[1] ? DataFrame(x1=[true]) : er, df, :a) + @test res == DataFrame(map(sdf -> sdf.x1[1] ? DataFrame(x1=[true]) : er, groupby(df, :a))) + if nrow(res) == 0 + @test res == DataFrame(a=Int[]) + else + @test res == df[df.x1, :] + end + + res = by(sdf -> sdf.x1[1] ? (x1=[true],) : er, df, :a) + if typemin(UInt8) < i < typemax(UInt8) + @test_throws ArgumentError by(sdf -> sdf.x1[1] ? (x1=true,) : er, df, :a) + end + @test res == DataFrame(map(sdf -> sdf.x1[1] ? (x1=[true],) : er, groupby(df, :a))) + if nrow(res) == 0 + @test res == DataFrame(a=Int[]) + else + @test res == df[df.x1, :] + end + + res = by(sdf -> sdf.x1[1] ? hcat(true) : er, df, :a) + if typemin(UInt8) < i < typemax(UInt8) + @test_throws ArgumentError by(sdf -> sdf.x1[1] ? true : er, df, :a) + end + @test res == DataFrame(map(sdf -> sdf.x1[1] ? hcat(true) : er, groupby(df, :a))) + if nrow(res) == 0 + @test res == DataFrame(a=Int[]) + else + @test res == df[df.x1, :] + end + end +end + end # module From a2bcaa56335e83fb62e043ffd3beab9467952e0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 26 Dec 2019 21:24:30 +0100 Subject: [PATCH 2/8] add SubDataFrame to tests --- test/grouping.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/grouping.jl b/test/grouping.jl index 6ed6686fae..f6b28416a1 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -1457,7 +1457,10 @@ end end @testset "Allow returning DataFrame() or NamedTuple() to drop group" begin - for i in typemin(UInt8):typemax(UInt8), er in (DataFrame(), NamedTuple(), rand(0,0), rand(5,0)) + for i in typemin(UInt8):typemax(UInt8), + er in (DataFrame(), view(DataFrame(ones(2,2)), 2:1, 2:1), + view(DataFrame(ones(2,2)), 1:2, 2:1), + NamedTuple(), rand(0,0), rand(5,0)) df = DataFrame(a = 1:8, x1 = Bool.(collect(bitstring(UInt8(i))) .- '0')) res = by(sdf -> sdf.x1[1] ? DataFrame(x1=[true]) : er, df, :a) @test res == DataFrame(map(sdf -> sdf.x1[1] ? DataFrame(x1=[true]) : er, groupby(df, :a))) From 964520ad7c7d40667b0b0a6aedb72a8c687becdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 26 Dec 2019 22:05:28 +0100 Subject: [PATCH 3/8] retain colnames as Tuple --- src/groupeddataframe/splitapplycombine.jl | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index d03b1cadd3..e4b1349169 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -820,17 +820,17 @@ function _combine_with_first(first::Union{NamedTuple, DataFrameRow, AbstractData let eltys=eltys, n=n # Workaround for julia#15276 initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], n), _ncol(first)) end - targetcolnames = collect(Symbol, propertynames(first)) - outcols = first isa Union{AbstractDataFrame, + targetcolnames = tuple(propertynames(first)...) + outcols, finalcolnames = first isa Union{AbstractDataFrame, NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}} ? _combine_with_first!(first, initialcols, idx, 1, 1, f, gd, incols, targetcolnames) : _combine_with_first_row!(first, initialcols, idx, 1, 1, f, gd, incols, targetcolnames) - idx, outcols, targetcolnames + idx, outcols, collect(Symbol, finalcolnames) end function fill_row!(row, outcols::NTuple{N, AbstractVector}, i::Integer, colstart::Integer, - colnames::Vector{Symbol}) where N + colnames::NTuple{N, Symbol}) where N if !isa(row, Union{NamedTuple, DataFrameRow}) throw(ArgumentError("return value must not change its kind " * "(single row or variable number of rows) across groups")) @@ -866,7 +866,7 @@ function _combine_with_first_row!(first::Union{NamedTuple, DataFrameRow}, idx::Vector{Int}, rowstart::Integer, colstart::Integer, f::Any, gd::GroupedDataFrame, incols::Union{Nothing, AbstractVector, NamedTuple}, - colnames::Vector{Symbol}) where N + colnames::NTuple{N, Symbol}) where N len = length(gd) gdidx = gd.idx starts = gd.starts @@ -898,7 +898,7 @@ function _combine_with_first_row!(first::Union{NamedTuple, DataFrameRow}, end idx[i] = gdidx[starts[i]] end - outcols + return outcols, colnames end # This needs to be in a separate function @@ -916,7 +916,7 @@ else end function append_rows!(rows, outcols::NTuple{N, AbstractVector}, - colstart::Integer, colnames::Vector{Symbol}) where N + colstart::Integer, colnames::NTuple{N, Symbol}) where N if !isa(rows, Union{AbstractDataFrame, NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}) throw(ArgumentError("return value must not change its kind " * "(single row or variable number of rows) across groups")) @@ -949,7 +949,7 @@ function _combine_with_first!(first::Union{AbstractDataFrame, idx::Vector{Int}, rowstart::Integer, colstart::Integer, f::Any, gd::GroupedDataFrame, incols::Union{Nothing, AbstractVector, NamedTuple}, - colnames::Vector{Symbol}) where N + colnames::NTuple{N, Symbol}) where N len = length(gd) gdidx = gd.idx starts = gd.starts @@ -971,14 +971,14 @@ function _combine_with_first!(first::Union{AbstractDataFrame, end _ncol(rows) == 0 && continue if isempty(colnames) - append!(colnames, collect(Symbol, propertynames(rows))) + newcolnames = tuple(propertynames(rows)...) if rows isa AbstractDataFrame eltys = eltype.(eachcol(rows)) else eltys = map(eltype, rows) end initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], 0), _ncol(rows)) - return _combine_with_first!(rows, initialcols, idx, i, 1, f, gd, incols, colnames) + return _combine_with_first!(rows, initialcols, idx, i, 1, f, gd, incols, newcolnames) end j = append_rows!(rows, outcols, 1, colnames) if j !== nothing # Need to widen column type @@ -999,7 +999,7 @@ function _combine_with_first!(first::Union{AbstractDataFrame, end append!(idx, Iterators.repeated(gdidx[starts[i]], _nrow(rows))) end - outcols + return outcols, colnames end """ From 14ed8f2bc74f7cfcab2bf06eee9a5efe29f2eb61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 27 Dec 2019 23:50:27 +0100 Subject: [PATCH 4/8] Apply suggestions from code review Co-Authored-By: Milan Bouchet-Valat --- test/grouping.jl | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/test/grouping.jl b/test/grouping.jl index 840ed08ca1..789bd96cad 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -1485,9 +1485,10 @@ end NamedTuple(), rand(0,0), rand(5,0)) df = DataFrame(a = 1:8, x1 = Bool.(collect(bitstring(UInt8(i))) .- '0')) res = by(sdf -> sdf.x1[1] ? DataFrame(x1=[true]) : er, df, :a) - @test res == DataFrame(map(sdf -> sdf.x1[1] ? DataFrame(x1=[true]) : er, groupby(df, :a))) + @test res == DataFrame(map(sdf -> sdf.x1[1] ? DataFrame(x1=[true]) : er, groupby_checked(df, :a))) if nrow(res) == 0 - @test res == DataFrame(a=Int[]) + @test res == DataFrame(a=[]) + @test typeof(res.a) == Vector{Int} else @test res == df[df.x1, :] end @@ -1496,9 +1497,10 @@ end if typemin(UInt8) < i < typemax(UInt8) @test_throws ArgumentError by(sdf -> sdf.x1[1] ? (x1=true,) : er, df, :a) end - @test res == DataFrame(map(sdf -> sdf.x1[1] ? (x1=[true],) : er, groupby(df, :a))) + @test res == DataFrame(map(sdf -> sdf.x1[1] ? (x1=[true],) : er, groupby_checked(df, :a))) if nrow(res) == 0 - @test res == DataFrame(a=Int[]) + @test res == DataFrame(a=[]) + @test typeof(res.a) == Vector{Int} else @test res == df[df.x1, :] end @@ -1507,9 +1509,10 @@ end if typemin(UInt8) < i < typemax(UInt8) @test_throws ArgumentError by(sdf -> sdf.x1[1] ? true : er, df, :a) end - @test res == DataFrame(map(sdf -> sdf.x1[1] ? hcat(true) : er, groupby(df, :a))) + @test res == DataFrame(map(sdf -> sdf.x1[1] ? hcat(true) : er, groupby_checked(df, :a))) if nrow(res) == 0 - @test res == DataFrame(a=Int[]) + @test res == DataFrame(a=[]) + @test typeof(res.a) == Vector{Int} else @test res == df[df.x1, :] end From 4c0a4943770e445e65e75353ec433f80fbe5136e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 28 Dec 2019 00:57:52 +0100 Subject: [PATCH 5/8] fixes after the code review --- src/groupeddataframe/splitapplycombine.jl | 52 ++++++++++++----------- test/grouping.jl | 41 ++++++------------ 2 files changed, 41 insertions(+), 52 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 1b19796602..826654fb0a 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -201,7 +201,7 @@ which is determined using the following rules: for each group as the length of the returned vector for that group. - A data frame, a named tuple of vectors or a matrix gives the same additional columns and as many rows for each group as the rows returned for that group. - In this case returning a table with no columns drops group. + (a table with zero columns drops the group) `f` must always return the same kind of object (as defined in the above list) for all groups, and if a named tuple or data frame, with the same fields or columns. @@ -341,7 +341,7 @@ which is determined using the following rules: for each group as the length of the returned vector for that group. - A data frame, a named tuple of vectors or a matrix gives a `DataFrame` with the same additional columns and as many rows for each group as the rows returned for that group. - In this case returning a table with no columns drops group. + (a table with zero columns drops the group) `f` must always return the same kind of object (as defined in the above list) for all groups, and if a named tuple or data frame, with the same fields or columns. @@ -829,10 +829,12 @@ function _combine_with_first(first::Union{NamedTuple, DataFrameRow, AbstractData initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], n), _ncol(first)) end targetcolnames = tuple(propertynames(first)...) - outcols, finalcolnames = first isa Union{AbstractDataFrame, - NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}} ? - _combine_with_first!(first, initialcols, idx, 1, 1, f, gd, incols, targetcolnames) : - _combine_with_first_row!(first, initialcols, idx, 1, 1, f, gd, incols, targetcolnames) + outcols, finalcolnames = if first isa Union{AbstractDataFrame, + NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}} + _combine_tables_with_first!(first, initialcols, idx, 1, 1, f, gd, incols, targetcolnames) + else + _combine_rows_with_first!(first, initialcols, idx, 1, 1, f, gd, incols, targetcolnames) + end idx, outcols, collect(Symbol, finalcolnames) end @@ -869,12 +871,12 @@ function fill_row!(row, outcols::NTuple{N, AbstractVector}, return nothing end -function _combine_with_first_row!(first::Union{NamedTuple, DataFrameRow}, - outcols::NTuple{N, AbstractVector}, - idx::Vector{Int}, rowstart::Integer, colstart::Integer, - f::Any, gd::GroupedDataFrame, - incols::Union{Nothing, AbstractVector, NamedTuple}, - colnames::NTuple{N, Symbol}) where N +function _combine_rows_with_first!(first::Union{NamedTuple, DataFrameRow}, + outcols::NTuple{N, AbstractVector}, + idx::Vector{Int}, rowstart::Integer, colstart::Integer, + f::Any, gd::GroupedDataFrame, + incols::Union{Nothing, AbstractVector, NamedTuple}, + colnames::NTuple{N, Symbol}) where N len = length(gd) gdidx = gd.idx starts = gd.starts @@ -902,7 +904,7 @@ function _combine_with_first_row!(first::Union{NamedTuple, DataFrameRow}, end end end - return _combine_with_first_row!(row, newcols, idx, i, j, f, gd, incols, colnames) + return _combine_rows_with_first!(row, newcols, idx, i, j, f, gd, incols, colnames) end idx[i] = gdidx[starts[i]] end @@ -940,7 +942,7 @@ function append_rows!(rows, outcols::NTuple{N, AbstractVector}, vals = getproperty(rows, cn) catch throw(ArgumentError("return value must have the same column names " * - "for all groups (got $(Tuple(colnames)) and $(Tuple(names(rows))))")) + "for all groups (got $colnames and $(propertynames(rows)))")) end S = eltype(vals) T = eltype(col) @@ -951,20 +953,20 @@ function append_rows!(rows, outcols::NTuple{N, AbstractVector}, return nothing end -function _combine_with_first!(first::Union{AbstractDataFrame, - NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}, - outcols::NTuple{N, AbstractVector}, - idx::Vector{Int}, rowstart::Integer, colstart::Integer, - f::Any, gd::GroupedDataFrame, - incols::Union{Nothing, AbstractVector, NamedTuple}, - colnames::NTuple{N, Symbol}) where N +function _combine_tables_with_first!(first::Union{AbstractDataFrame, + NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}, + outcols::NTuple{N, AbstractVector}, + idx::Vector{Int}, rowstart::Integer, colstart::Integer, + f::Any, gd::GroupedDataFrame, + incols::Union{Nothing, AbstractVector, NamedTuple}, + colnames::NTuple{N, Symbol}) where N len = length(gd) gdidx = gd.idx starts = gd.starts ends = gd.ends # Handle first group - @assert isempty(colnames) == (_ncol(first) == 0) == isempty(outcols) + @assert _ncol(first) == N if !isempty(colnames) j = append_rows!(first, outcols, colstart, colnames) @assert j === nothing # eltype is guaranteed to match @@ -986,7 +988,7 @@ function _combine_with_first!(first::Union{AbstractDataFrame, eltys = map(eltype, rows) end initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], 0), _ncol(rows)) - return _combine_with_first!(rows, initialcols, idx, i, 1, f, gd, incols, newcolnames) + return _combine_tables_with_first!(rows, initialcols, idx, i, 1, f, gd, incols, newcolnames) end j = append_rows!(rows, outcols, 1, colnames) if j !== nothing # Need to widen column type @@ -1003,7 +1005,7 @@ function _combine_with_first!(first::Union{AbstractDataFrame, end end end - return _combine_with_first!(rows, newcols, idx, i, j, f, gd, incols, colnames) + return _combine_tables_with_first!(rows, newcols, idx, i, j, f, gd, incols, colnames) end append!(idx, Iterators.repeated(gdidx[starts[i]], _nrow(rows))) end @@ -1053,7 +1055,7 @@ which is determined using the following rules: for each group as the length of the returned vector for that group. - A data frame, a named tuple of vectors or a matrix gives a `DataFrame` with the same additional columns and as many rows for each group as the rows returned for that group. - In this case returning a table with no columns drops group. + (a table with zero columns drops the group) `f` must always return the same kind of object (as defined in the above list) for all groups, and if a named tuple or data frame, with the same fields or columns. diff --git a/test/grouping.jl b/test/grouping.jl index 789bd96cad..2182257922 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -1479,43 +1479,30 @@ end end @testset "Allow returning DataFrame() or NamedTuple() to drop group" begin - for i in typemin(UInt8):typemax(UInt8), + N = 4 + for (i, x1) in enumerate(collect.(Iterators.product(repeat([[true, false]], N)...))), er in (DataFrame(), view(DataFrame(ones(2,2)), 2:1, 2:1), view(DataFrame(ones(2,2)), 1:2, 2:1), - NamedTuple(), rand(0,0), rand(5,0)) - df = DataFrame(a = 1:8, x1 = Bool.(collect(bitstring(UInt8(i))) .- '0')) - res = by(sdf -> sdf.x1[1] ? DataFrame(x1=[true]) : er, df, :a) - @test res == DataFrame(map(sdf -> sdf.x1[1] ? DataFrame(x1=[true]) : er, groupby_checked(df, :a))) - if nrow(res) == 0 + NamedTuple(), rand(0,0), rand(5,0), + DataFrame(x1=Int[]), DataFrame(x1=Any[]), + (x1=Int[],), (x1=Any[],), rand(0,1)), + fr in (DataFrame(x1=[true]), (x1=[true],), hcat(true)) + df = DataFrame(a = 1:N, x1 = x1) + res = by(sdf -> sdf.x1[1] ? fr : er, df, :a) + @test res == DataFrame(map(sdf -> sdf.x1[1] ? fr : er, groupby_checked(df, :a))) + @test res == by(df, :a, :x1 => x -> x[1] ? fr : er) + @test res == by(df, :a, (:a, :x1) => x -> x.x1[1] ? fr : er) + if nrow(res) == 0 && length(propertynames(er)) == 0 && er != rand(0, 1) @test res == DataFrame(a=[]) @test typeof(res.a) == Vector{Int} else @test res == df[df.x1, :] end - - res = by(sdf -> sdf.x1[1] ? (x1=[true],) : er, df, :a) - if typemin(UInt8) < i < typemax(UInt8) + if 1 < i < 2^N @test_throws ArgumentError by(sdf -> sdf.x1[1] ? (x1=true,) : er, df, :a) - end - @test res == DataFrame(map(sdf -> sdf.x1[1] ? (x1=[true],) : er, groupby_checked(df, :a))) - if nrow(res) == 0 - @test res == DataFrame(a=[]) - @test typeof(res.a) == Vector{Int} - else - @test res == df[df.x1, :] - end - - res = by(sdf -> sdf.x1[1] ? hcat(true) : er, df, :a) - if typemin(UInt8) < i < typemax(UInt8) + @test_throws ArgumentError by(sdf -> sdf.x1[1] ? fr : (x2=[true],), df, :a) @test_throws ArgumentError by(sdf -> sdf.x1[1] ? true : er, df, :a) end - @test res == DataFrame(map(sdf -> sdf.x1[1] ? hcat(true) : er, groupby_checked(df, :a))) - if nrow(res) == 0 - @test res == DataFrame(a=[]) - @test typeof(res.a) == Vector{Int} - else - @test res == df[df.x1, :] - end end end From d461a0e6e1075a378e464ec352af82526aa3f815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 28 Dec 2019 01:38:01 +0100 Subject: [PATCH 6/8] add more tests --- test/grouping.jl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/test/grouping.jl b/test/grouping.jl index 2182257922..d269d1ecee 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -1486,11 +1486,19 @@ end NamedTuple(), rand(0,0), rand(5,0), DataFrame(x1=Int[]), DataFrame(x1=Any[]), (x1=Int[],), (x1=Any[],), rand(0,1)), - fr in (DataFrame(x1=[true]), (x1=[true],), hcat(true)) + fr in (DataFrame(x1=[true]), (x1=[true],), hcat(true), [true]) df = DataFrame(a = 1:N, x1 = x1) res = by(sdf -> sdf.x1[1] ? fr : er, df, :a) @test res == DataFrame(map(sdf -> sdf.x1[1] ? fr : er, groupby_checked(df, :a))) - @test res == by(df, :a, :x1 => x -> x[1] ? fr : er) + if fr == [true] + if df.x1[1] + @test rename(res, 2=>:x1_function) == by(df, :a, :x1 => x -> x[1] ? fr : er) + else + @test res == by(df, :a, :x1 => x -> x[1] ? fr : er) + end + else + @test res == by(df, :a, :x1 => x -> x[1] ? fr : er) + end @test res == by(df, :a, (:a, :x1) => x -> x.x1[1] ? fr : er) if nrow(res) == 0 && length(propertynames(er)) == 0 && er != rand(0, 1) @test res == DataFrame(a=[]) From 39ab56d3ea5ce6d3b2576629f867bbe7e00261f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 28 Dec 2019 21:48:02 +0100 Subject: [PATCH 7/8] improve docstring --- src/groupeddataframe/splitapplycombine.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 826654fb0a..3e0a6438f1 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -201,7 +201,8 @@ which is determined using the following rules: for each group as the length of the returned vector for that group. - A data frame, a named tuple of vectors or a matrix gives the same additional columns and as many rows for each group as the rows returned for that group. - (a table with zero columns drops the group) + As a special case, returning an empty table with zero columns is allowed, + whatever the number of columns returned for other groups. `f` must always return the same kind of object (as defined in the above list) for all groups, and if a named tuple or data frame, with the same fields or columns. @@ -341,7 +342,8 @@ which is determined using the following rules: for each group as the length of the returned vector for that group. - A data frame, a named tuple of vectors or a matrix gives a `DataFrame` with the same additional columns and as many rows for each group as the rows returned for that group. - (a table with zero columns drops the group) + As a special case, returning an empty table with zero columns is allowed, + whatever the number of columns returned for other groups. `f` must always return the same kind of object (as defined in the above list) for all groups, and if a named tuple or data frame, with the same fields or columns. @@ -1055,7 +1057,8 @@ which is determined using the following rules: for each group as the length of the returned vector for that group. - A data frame, a named tuple of vectors or a matrix gives a `DataFrame` with the same additional columns and as many rows for each group as the rows returned for that group. - (a table with zero columns drops the group) + As a special case, returning an empty table with zero columns is allowed, + whatever the number of columns returned for other groups. `f` must always return the same kind of object (as defined in the above list) for all groups, and if a named tuple or data frame, with the same fields or columns. From 6b9b8aa68d74a131f4e2a4089861099b50c06ad2 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Thu, 2 Jan 2020 18:32:49 +0100 Subject: [PATCH 8/8] Break lines at 92 chars --- src/groupeddataframe/splitapplycombine.jl | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 3e0a6438f1..ab6cfc481c 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -831,11 +831,13 @@ function _combine_with_first(first::Union{NamedTuple, DataFrameRow, AbstractData initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], n), _ncol(first)) end targetcolnames = tuple(propertynames(first)...) - outcols, finalcolnames = if first isa Union{AbstractDataFrame, - NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}} - _combine_tables_with_first!(first, initialcols, idx, 1, 1, f, gd, incols, targetcolnames) + if first isa Union{AbstractDataFrame, + NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}} + outcols, finalcolnames = _combine_tables_with_first!(first, initialcols, idx, 1, 1, + f, gd, incols, targetcolnames) else - _combine_rows_with_first!(first, initialcols, idx, 1, 1, f, gd, incols, targetcolnames) + outcols, finalcolnames = _combine_rows_with_first!(first, initialcols, idx, 1, 1, + f, gd, incols, targetcolnames) end idx, outcols, collect(Symbol, finalcolnames) end @@ -906,7 +908,8 @@ function _combine_rows_with_first!(first::Union{NamedTuple, DataFrameRow}, end end end - return _combine_rows_with_first!(row, newcols, idx, i, j, f, gd, incols, colnames) + return _combine_rows_with_first!(row, newcols, idx, i, j, + f, gd, incols, colnames) end idx[i] = gdidx[starts[i]] end @@ -977,7 +980,8 @@ function _combine_tables_with_first!(first::Union{AbstractDataFrame, # Handle remaining groups @inbounds for i in rowstart+1:len rows = wrap(do_call(f, gdidx, starts, ends, gd, incols, i)) - if !(rows isa Union{AbstractDataFrame, NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}) + if !(rows isa Union{AbstractDataFrame, + NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}) throw(ArgumentError("return value must not change its kind " * "(single row or variable number of rows) across groups")) end @@ -990,7 +994,8 @@ function _combine_tables_with_first!(first::Union{AbstractDataFrame, eltys = map(eltype, rows) end initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], 0), _ncol(rows)) - return _combine_tables_with_first!(rows, initialcols, idx, i, 1, f, gd, incols, newcolnames) + return _combine_tables_with_first!(rows, initialcols, idx, i, 1, + f, gd, incols, newcolnames) end j = append_rows!(rows, outcols, 1, colnames) if j !== nothing # Need to widen column type @@ -1007,7 +1012,8 @@ function _combine_tables_with_first!(first::Union{AbstractDataFrame, end end end - return _combine_tables_with_first!(rows, newcols, idx, i, j, f, gd, incols, colnames) + return _combine_tables_with_first!(rows, newcols, idx, i, j, + f, gd, incols, colnames) end append!(idx, Iterators.repeated(gdidx[starts[i]], _nrow(rows))) end