Allow tables with no columns in combine to drop groups

JuliaData · Jan 2, 2020 · f9a0f7c · f9a0f7c
1 parent 9c8de96
commit f9a0f7c
Show file tree

Hide file tree

Showing 2 changed files with 102 additions and 32 deletions.
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
@@ -201,6 +201,8 @@ which is determined using the following rules:
   for each group as the length of the returned vector for that group.
 - A data frame, a named tuple of vectors or a matrix gives the same additional columns
   and as many rows for each group as the rows returned for that group.
+  As a special case, returning an empty table with zero columns is allowed,
+  whatever the number of columns returned for other groups.
 
 `f` must always return the same kind of object (as defined in the above list) for
 all groups, and if a named tuple or data frame, with the same fields or columns.
@@ -340,6 +342,8 @@ which is determined using the following rules:
   for each group as the length of the returned vector for that group.
 - A data frame, a named tuple of vectors or a matrix gives a `DataFrame` with the same
   additional columns and as many rows for each group as the rows returned for that group.
+  As a special case, returning an empty table with zero columns is allowed,
+  whatever the number of columns returned for other groups.
 
 `f` must always return the same kind of object (as defined in the above list) for
 all groups, and if a named tuple or data frame, with the same fields or columns.
@@ -798,7 +802,7 @@ function _combine(f::Any, gd::GroupedDataFrame)
          !isa(firstres, Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}))
          nms = [Symbol(names(gd.parent)[index(gd.parent)[first(f)]], '_', funname(fun))]
     end
-    valscat = DataFrame(collect(AbstractVector, outcols), collect(Symbol, nms))
+    valscat = DataFrame(collect(AbstractVector, outcols), nms)
     return idx, valscat
 end
 
@@ -826,9 +830,16 @@ function _combine_with_first(first::Union{NamedTuple, DataFrameRow, AbstractData
     let eltys=eltys, n=n # Workaround for julia#15276
         initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], n), _ncol(first))
     end
-    outcols = _combine_with_first!(first, initialcols, idx, 1, 1, f, gd, incols,
-                                   tuple(propertynames(first)...))
-    idx, outcols, propertynames(first)
+    targetcolnames = tuple(propertynames(first)...)
+    if first isa Union{AbstractDataFrame,
+                       NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}
+        outcols, finalcolnames = _combine_tables_with_first!(first, initialcols, idx, 1, 1,
+                                                             f, gd, incols, targetcolnames)
+    else
+        outcols, finalcolnames = _combine_rows_with_first!(first, initialcols, idx, 1, 1,
+                                                           f, gd, incols, targetcolnames)
+    end
+    idx, outcols, collect(Symbol, finalcolnames)
 end
 
 function fill_row!(row, outcols::NTuple{N, AbstractVector},
@@ -864,12 +875,12 @@ function fill_row!(row, outcols::NTuple{N, AbstractVector},
     return nothing
 end
 
-function _combine_with_first!(first::Union{NamedTuple, DataFrameRow},
-                              outcols::NTuple{N, AbstractVector},
-                              idx::Vector{Int}, rowstart::Integer, colstart::Integer,
-                              f::Any, gd::GroupedDataFrame,
-                              incols::Union{Nothing, AbstractVector, NamedTuple},
-                              colnames::NTuple{N, Symbol}) where N
+function _combine_rows_with_first!(first::Union{NamedTuple, DataFrameRow},
+                                   outcols::NTuple{N, AbstractVector},
+                                   idx::Vector{Int}, rowstart::Integer, colstart::Integer,
+                                   f::Any, gd::GroupedDataFrame,
+                                   incols::Union{Nothing, AbstractVector, NamedTuple},
+                                   colnames::NTuple{N, Symbol}) where N
     len = length(gd)
     gdidx = gd.idx
     starts = gd.starts
@@ -897,11 +908,12 @@ function _combine_with_first!(first::Union{NamedTuple, DataFrameRow},
                     end
                 end
             end
-            return _combine_with_first!(row, newcols, idx, i, j, f, gd, incols, colnames)
+            return _combine_rows_with_first!(row, newcols, idx, i, j,
+                                             f, gd, incols, colnames)
         end
         idx[i] = gdidx[starts[i]]
     end
-    outcols
+    return outcols, colnames
 end
 
 # This needs to be in a separate function
@@ -935,7 +947,7 @@ function append_rows!(rows, outcols::NTuple{N, AbstractVector},
             vals = getproperty(rows, cn)
         catch
             throw(ArgumentError("return value must have the same column names " *
-                                "for all groups (got $(Tuple(colnames)) and $(Tuple(names(rows))))"))
+                                "for all groups (got $colnames and $(propertynames(rows)))"))
         end
         S = eltype(vals)
         T = eltype(col)
@@ -946,24 +958,45 @@ function append_rows!(rows, outcols::NTuple{N, AbstractVector},
     return nothing
 end
 
-function _combine_with_first!(first::Union{AbstractDataFrame,
-                                           NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}},
-                              outcols::NTuple{N, AbstractVector},
-                              idx::Vector{Int}, rowstart::Integer, colstart::Integer,
-                              f::Any, gd::GroupedDataFrame,
-                              incols::Union{Nothing, AbstractVector, NamedTuple},
-                              colnames::NTuple{N, Symbol}) where N
+function _combine_tables_with_first!(first::Union{AbstractDataFrame,
+                                     NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}},
+                                     outcols::NTuple{N, AbstractVector},
+                                     idx::Vector{Int}, rowstart::Integer, colstart::Integer,
+                                     f::Any, gd::GroupedDataFrame,
+                                     incols::Union{Nothing, AbstractVector, NamedTuple},
+                                     colnames::NTuple{N, Symbol}) where N
     len = length(gd)
     gdidx = gd.idx
     starts = gd.starts
     ends = gd.ends
     # Handle first group
-    j = append_rows!(first, outcols, colstart, colnames)
-    @assert j === nothing # eltype is guaranteed to match
-    append!(idx, Iterators.repeated(gdidx[starts[rowstart]], _nrow(first)))
+
+    @assert _ncol(first) == N
+    if !isempty(colnames)
+        j = append_rows!(first, outcols, colstart, colnames)
+        @assert j === nothing # eltype is guaranteed to match
+        append!(idx, Iterators.repeated(gdidx[starts[rowstart]], _nrow(first)))
+    end
     # Handle remaining groups
     @inbounds for i in rowstart+1:len
         rows = wrap(do_call(f, gdidx, starts, ends, gd, incols, i))
+        if !(rows isa Union{AbstractDataFrame,
+                            NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}})
+            throw(ArgumentError("return value must not change its kind " *
+                                "(single row or variable number of rows) across groups"))
+        end
+        _ncol(rows) == 0 && continue
+        if isempty(colnames)
+            newcolnames = tuple(propertynames(rows)...)
+            if rows isa AbstractDataFrame
+                eltys = eltype.(eachcol(rows))
+            else
+                eltys = map(eltype, rows)
+            end
+            initialcols = ntuple(i -> Tables.allocatecolumn(eltys[i], 0), _ncol(rows))
+            return _combine_tables_with_first!(rows, initialcols, idx, i, 1,
+                                               f, gd, incols, newcolnames)
+        end
         j = append_rows!(rows, outcols, 1, colnames)
         if j !== nothing # Need to widen column type
             local newcols
@@ -979,11 +1012,12 @@ function _combine_with_first!(first::Union{AbstractDataFrame,
                     end
                 end
             end
-            return _combine_with_first!(rows, newcols, idx, i, j, f, gd, incols, colnames)
+            return _combine_tables_with_first!(rows, newcols, idx, i, j,
+                                               f, gd, incols, colnames)
         end
         append!(idx, Iterators.repeated(gdidx[starts[i]], _nrow(rows)))
     end
-    outcols
+    return outcols, colnames
 end
 
 """
@@ -1029,6 +1063,8 @@ which is determined using the following rules:
   for each group as the length of the returned vector for that group.
 - A data frame, a named tuple of vectors or a matrix gives a `DataFrame` with the same
   additional columns and as many rows for each group as the rows returned for that group.
+  As a special case, returning an empty table with zero columns is allowed,
+  whatever the number of columns returned for other groups.
 
 `f` must always return the same kind of object (as defined in the above list) for
 all groups, and if a named tuple or data frame, with the same fields or columns.

diff --git a/test/grouping.jl b/test/grouping.jl
@@ -421,8 +421,6 @@ end
     @test_throws ArgumentError by(d -> d.x == [1] ? (x1=1) : NamedTuple(), df, :x)
     @test_throws ArgumentError by(d -> d.x == [1] ? 1 : DataFrame(x1=1), df, :x)
     @test_throws ArgumentError by(d -> d.x == [1] ? DataFrame(x1=1) : 1, df, :x)
-    @test_throws ArgumentError by(d -> d.x == [1] ? DataFrame() : DataFrame(x1=1), df, :x)
-    @test_throws ArgumentError by(d -> d.x == [1] ? DataFrame(x1=1) : DataFrame(), df, :x)
     @test_throws ArgumentError by(d -> d.x == [1] ? (x1=1) : (x1=[1]), df, :x)
     @test_throws ArgumentError by(d -> d.x == [1] ? (x1=[1]) : (x1=1), df, :x)
     @test_throws ArgumentError by(d -> d.x == [1] ? 1 : [1], df, :x)
@@ -643,29 +641,29 @@ end
         by(df, :a, (:c => sum,)) ==
         by(df, :a, [:c => sum]) ==
         by(df, :a, c_sum = :c => sum) ==
-        by(d -> (c_sum=sum(d.c),), df, :a)
+        by(d -> (c_sum=sum(d.c),), df, :a) ==
         by(df, :a, d -> (c_sum=sum(d.c),))
 
     @test by(:c => vexp, df, :a) ==
         by(df, :a, :c => vexp) ==
         by(df, :a, (:c => vexp,)) ==
         by(df, :a, [:c => vexp]) ==
         by(df, :a, c_function = :c => vexp) ==
-        by(d -> (c_function=vexp(d.c),), df, :a)
+        by(d -> (c_function=vexp(d.c),), df, :a) ==
         by(df, :a, d -> (c_function=vexp(d.c),))
 
     @test by(df, :a, :b => sum, :c => sum) ==
         by(df, :a, (:b => sum, :c => sum,)) ==
         by(df, :a, [:b => sum, :c => sum]) ==
         by(df, :a, b_sum = :b => sum, c_sum = :c => sum) ==
-        by(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), df, :a)
+        by(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), df, :a) ==
         by(df, :a, d -> (b_sum=sum(d.b), c_sum=sum(d.c)))
 
     @test by(df, :a, :b => vexp, :c => identity) ==
         by(df, :a, (:b => vexp, :c => identity,)) ==
         by(df, :a, [:b => vexp, :c => identity]) ==
         by(df, :a, b_function = :b => vexp, c_identity = :c => identity) ==
-        by(d -> (b_function=vexp(d.b), c_identity=identity(d.c)), df, :a)
+        by(d -> (b_function=vexp(d.b), c_identity=identity(d.c)), df, :a) ==
         by(df, :a, d -> (b_function=vexp(d.b), c_identity=identity(d.c)))
 
     gd = groupby(df, :a)
@@ -689,7 +687,7 @@ end
         combine(gd, c_function = :c => vexp) ==
         combine(:c => x -> (c_function=exp.(x),), gd) ==
         combine(gd, :c => x -> (c_function=exp.(x),)) ==
-        combine(d -> (c_function=exp.(d.c),), gd)
+        combine(d -> (c_function=exp.(d.c),), gd) ==
         combine(gd, d -> (c_function=exp.(d.c),))
 
     @test combine(gd, :b => sum, :c => sum) ==
@@ -1484,4 +1482,40 @@ end
     @test by(sdf -> sdf[1, [3,2,1]], df, :a) == df[1:2:5, [1,3,2]]
 end
 
+@testset "Allow returning DataFrame() or NamedTuple() to drop group" begin
+    N = 4
+    for (i, x1) in enumerate(collect.(Iterators.product(repeat([[true, false]], N)...))),
+        er in (DataFrame(), view(DataFrame(ones(2,2)), 2:1, 2:1),
+               view(DataFrame(ones(2,2)), 1:2, 2:1),
+               NamedTuple(), rand(0,0), rand(5,0),
+               DataFrame(x1=Int[]), DataFrame(x1=Any[]),
+               (x1=Int[],), (x1=Any[],), rand(0,1)),
+        fr in (DataFrame(x1=[true]), (x1=[true],), hcat(true), [true])
+        df = DataFrame(a = 1:N, x1 = x1)
+        res = by(sdf -> sdf.x1[1] ? fr : er, df, :a)
+        @test res == DataFrame(map(sdf -> sdf.x1[1] ? fr : er, groupby_checked(df, :a)))
+        if fr == [true]
+            if df.x1[1]
+                @test rename(res, 2=>:x1_function) == by(df, :a, :x1 => x -> x[1] ? fr : er)
+            else
+                @test res == by(df, :a, :x1 => x -> x[1] ? fr : er)
+            end
+        else
+            @test res == by(df, :a, :x1 => x -> x[1] ? fr : er)
+        end
+        @test res == by(df, :a, (:a, :x1) => x -> x.x1[1] ? fr : er)
+        if nrow(res) == 0 && length(propertynames(er)) == 0 && er != rand(0, 1)
+            @test res == DataFrame(a=[])
+            @test typeof(res.a) == Vector{Int}
+        else
+            @test res == df[df.x1, :]
+        end
+        if 1 < i < 2^N
+            @test_throws ArgumentError by(sdf -> sdf.x1[1] ? (x1=true,) : er, df, :a)
+            @test_throws ArgumentError by(sdf -> sdf.x1[1] ? fr : (x2=[true],), df, :a)
+            @test_throws ArgumentError by(sdf -> sdf.x1[1] ? true : er, df, :a)
+        end
+    end
+end
+
 end # module