Reinstate combine(), change map() to return a GroupedDataFrame

JuliaData · Sep 29, 2018 · 256bd1d · 256bd1d
1 parent 69ae02b
commit 256bd1d
Show file tree

Hide file tree

Showing 5 changed files with 149 additions and 40 deletions.
diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
@@ -14,8 +14,10 @@ Pages = ["functions.md"]
 aggregate
 by
 colwise
+combine
 groupby
 join
+map
 melt
 stack
 unstack
@@ -27,7 +29,6 @@ meltdf
 
 ```@docs
 allowmissing!
-map
 completecases
 describe
 disallowmissing!

diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -28,6 +28,7 @@ export AbstractDataFrame,
        by,
        categorical!,
        colwise,
+       combine,
        completecases,
        deleterows!,
        describe,

diff --git a/src/deprecated.jl b/src/deprecated.jl
@@ -1382,6 +1382,3 @@ end
 
 import Base: map
 @deprecate map(f::Function, sdf::SubDataFrame) f(sdf)
-
-@deprecate combine(f::Function, gd::GroupedDataFrame) map(f, gd)
-@deprecate combine(gd::GroupedDataFrame) map(identity, gd)
diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl
@@ -50,14 +50,17 @@ groupby(cols; sort = false, skipmissing = false)
 
 An iterator over a `GroupedDataFrame` returns a `SubDataFrame` view
 for each grouping into `d`. A `GroupedDataFrame` also supports
-indexing by groups and `map` (which applies a function to each group
+indexing by groups, `map` (which applies a function to each group)
+and `combine` (which applies a function to each group
 and combines the result into a data frame).
 
 See the following for additional split-apply-combine operations:
 
 * `by` : split-apply-combine using functions
 * `aggregate` : split-apply-combine; applies functions in the form of a cross product
-* `colwise` : apply a function to each column in an AbstractDataFrame or GroupedDataFrame
+* `colwise` : apply a function to each column in an `AbstractDataFrame` or `GroupedDataFrame`
+* `map` : apply a function to each group of a `GroupedDataFrame` (without combining)
+* `combine` : combine a `GroupedDataFrame`, optionally applying a function to each group
 
 ### Examples
 
@@ -73,6 +76,7 @@ for g in gd
     println(g)
 end
 map(d -> mean(skipmissing(d[:c])), gd)
+combine(d -> mean(skipmissing(d[:c])), gd)
 ```
 
 """
@@ -121,23 +125,77 @@ wrap(s::Union{AbstractVector, Tuple}) = DataFrame(x1 = s)
 wrap(s::Any) = (x1 = s,)
 
 """
-Apply a function to each group of rows and combine the result
+    map(f::Function, gd::GroupedDataFrame)
+
+Apply a function to each group of rows and return a `GroupedDataFrame`.
+
+For each group in `gd`, `f` is passed a `SubDataFrame` view holding the corresponding rows.
+`f` can return a single value, a named tuple, a vector, or a data frame.
+This determines the shape of the data frame underlying the returned `GroupedDataFrame`:
+- A single value gives a data frame with a single column and one row per group.
+- A named tuple gives a data frame with one column for each field in the named tuple
+  and one row per group.
+- A vector gives a data frame with a single column and as many rows
+  for each group as the length of the returned vector for that group.
+- A data frame gives a data frame with the same columns and as many rows
+  for each group as the rows returned for that group.
+
+In all cases, the resulting `GroupedDataFrame` contains all the grouping columns in addition
+to those listed above. Note that `f` must always return the same type of object for
+all groups, and (if a named tuple or data frame) with the same fields or columns.
+Returning a single value or a named tuple is significantly faster than
+returning a vector or a data frame.
+
+### Examples
 
 ```julia
-map(f::Function, gd::GroupedDataFrame)
+df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]),
+               b = repeat([2, 1], outer=[4]),
+               c = randn(8))
+gd = groupby(df, :a)
+map(d -> sum(skipmissing(d[:c])), gd)
 ```
 
-### Arguments
+### See also
 
-* `gd` : a `GroupedDataFrame` object
+`combine(f, gd)` returns a `DataFrame` rather than a `GroupedDataFrame`.
 
-### Returns
+"""
+function Base.map(f::Function, gd::GroupedDataFrame)
+    if length(gd) > 0
+        idx, valscat = _combine(wrap(f(gd[1])), f, gd)
+        parent = hcat!(gd.parent[idx, gd.cols], valscat)
+        starts = Vector{Int}(undef, length(gd))
+        ends = Vector{Int}(undef, length(gd))
+        ngroups = 0
+        # A new group starts wherever `idx` (parent row supplying the keys) changes value
+        @inbounds for i in 1:length(idx)
+            if i == 1 || idx[i] != idx[i-1]
+                ngroups > 0 && (ends[ngroups] = i - 1)
+                ngroups += 1
+                starts[ngroups] = i
+            end
+        end
+        # Groups for which `f` returned no rows are dropped (possibly all of them)
+        resize!(starts, ngroups)
+        resize!(ends, ngroups)
+        ngroups > 0 && (ends[ngroups] = length(idx))
+        return GroupedDataFrame(parent, gd.cols, collect(1:length(idx)), starts, ends)
+    else
+        return GroupedDataFrame(similar(gd.parent[gd.cols], 0), gd.cols,
+                                Int[], Int[], Int[])
+    end
+end
 
-* `::DataFrame`
+"""
+    combine(gd::GroupedDataFrame)
+    combine(f::Function, gd::GroupedDataFrame)
 
-### Details
+Transform a `GroupedDataFrame` into a `DataFrame`.
+If a function `f` is provided, it is called for each group in `gd` with a `SubDataFrame` view
+holding the corresponding rows, and the returned `DataFrame` then consists of the returned rows
+plus the grouping columns.
 
-For each group in `gd`, `f` is passed a `SubDataFrame` view with the corresponding rows.
 `f` can return a single value, a named tuple, a vector, or a data frame.
 This determines the shape of the resulting data frame:
 - A single value gives a data frame with a single column and one row per group.
@@ -169,18 +227,23 @@ map(d -> sum(skipmissing(d[:c])), gd)
 
 ### See also
 
-`by(f, df, cols)` is a shorthand for `map(f, groupby(df, cols))`.
+[`by(f, df, cols)`](@ref) is a shorthand for `combine(f, groupby(df, cols))`.
+
+[`map`](@ref): `combine(f, groupby(df, cols))` is a more efficient equivalent
+of `combine(map(f, groupby(df, cols)))`.
 
 """
-function Base.map(f::Function, gd::GroupedDataFrame)
+function combine(f::Function, gd::GroupedDataFrame)
     if length(gd) > 0
         idx, valscat = _combine(wrap(f(gd[1])), f, gd)
         return hcat!(gd.parent[idx, gd.cols], valscat)
     else
-        return similar(gd.parent, 0)[gd.cols]
+        return similar(gd.parent[gd.cols], 0)
     end
 end
 
+combine(gd::GroupedDataFrame) = combine(identity, gd)
+
 function _combine(first::NamedTuple, f::Function, gd::GroupedDataFrame)
     m = length(first)
     n = length(gd)
@@ -413,7 +476,7 @@ returning a vector or a data frame.
 A method is defined with `f` as the first argument, so do-block
 notation can be used.
 
-`by(d, cols, f)` is equivalent to `map(f, groupby(d, cols))`.
+`by(d, cols, f)` is equivalent to `combine(f, groupby(d, cols))`.
 
 ### Returns
 
@@ -436,7 +499,7 @@ end
 
 """
 by(d::AbstractDataFrame, cols, f::Function; sort::Bool = false) =
-    map(f, groupby(d, cols, sort = sort))
+    combine(f, groupby(d, cols, sort = sort))
 by(f::Function, d::AbstractDataFrame, cols; sort::Bool = false) =
     by(d, cols, f, sort = sort)
 
@@ -491,7 +554,7 @@ end
 aggregate(gd::GroupedDataFrame, f::Function; sort::Bool=false) = aggregate(gd, [f], sort=sort)
 function aggregate(gd::GroupedDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function
     headers = _makeheaders(fs, setdiff(_names(gd), _names(gd.parent[gd.cols])))
-    res = map(x -> _aggregate(without(x, gd.cols), fs, headers), gd)
+    res = combine(x -> _aggregate(without(x, gd.cols), fs, headers), gd)
     sort && sort!(res, headers)
     res
 end

diff --git a/test/grouping.jl b/test/grouping.jl
@@ -102,7 +102,7 @@ module TestGrouping
         sres = sort(res)
 
         # by() without groups sorting
-        @test sort(by(df, cols, identity, sort=true)[[:a, :b, :c]]) == sort(df)
+        @test sort(by(df, cols, identity)[[:a, :b, :c]]) == sort(df)
         @test by(df, cols, f) == res
         @test by(df, cols, g) == res
         @test rename(by(df, cols, h), :x1 => :cmax) == res
@@ -118,21 +118,21 @@ module TestGrouping
 
         # groupby() without groups sorting
         gd = groupby(df, cols)
-        @test sort(map(identity, gd)[[:a, :b, :c]]) == sort(df)
-        @test map(f, gd) == res
-        @test map(g, gd) == res
-        @test rename(map(h, gd), :x1 => :cmax) == res
+        @test sort(combine(identity, gd)[[:a, :b, :c]]) == sort(df)
+        @test combine(f, gd) == res
+        @test combine(g, gd) == res
+        @test rename(combine(h, gd), :x1 => :cmax) == res
 
         # groupby() with groups sorting
         gd = groupby(df, cols, sort=true)
         for i in 1:length(gd)
             @test all(gd[i].a .== sres.a[i])
             @test all(gd[i].b .== sres.b[i])
         end
-        @test map(identity, gd)[[:a, :b, :c]] == sort(df, cols)
-        @test map(f, gd) == sres
-        @test map(g, gd) == sres
-        @test rename(map(h, gd), :x1 => :cmax) == sres
+        @test combine(identity, gd)[[:a, :b, :c]] == sort(df, cols)
+        @test combine(f, gd) == sres
+        @test combine(g, gd) == sres
+        @test rename(combine(h, gd), :x1 => :cmax) == sres
 
         # testing pool overflow
         df2 = DataFrame(v1 = categorical(collect(1:1000)), v2 = categorical(fill(1, 1000)))
@@ -178,27 +178,27 @@ module TestGrouping
         @test gd[4] == DataFrame(Key1="B", Key2="B", Value=4)
 
         # Check that CategoricalArray column is preserved when returning a value...
-        res = map(d -> DataFrame(x=d[1, :Key2]), groupby(df, :Key1))
+        res = combine(d -> DataFrame(x=d[1, :Key2]), groupby(df, :Key1))
         @test res.x isa CategoricalVector{String}
-        res = map(d -> (x=d[1, :Key2],), groupby(df, :Key1))
+        res = combine(d -> (x=d[1, :Key2],), groupby(df, :Key1))
         @test res.x isa CategoricalVector{String}
         # ...and when returning an array
-        res = map(d -> DataFrame(x=d[:Key1]), groupby(df, :Key1))
+        res = combine(d -> DataFrame(x=d[:Key1]), groupby(df, :Key1))
         @test res.x isa CategoricalVector{String}
 
         # Check that CategoricalArray and String give a String...
-        res = map(d -> d.Key1 == ["A", "A"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"),
-                  groupby(df, :Key1))
+        res = combine(d -> d.Key1 == ["A", "A"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"),
+                      groupby(df, :Key1))
         @test res.x isa Vector{String}
-        res = map(d -> d.Key1 == ["A", "A"] ? (x=d[1, :Key1],) : (x="C",),
-                  groupby(df, :Key1))
+        res = combine(d -> d.Key1 == ["A", "A"] ? (x=d[1, :Key1],) : (x="C",),
+                      groupby(df, :Key1))
         @test res.x isa Vector{String}
         # ...even when CategoricalString comes second
-        res = map(d -> d.Key1 == ["B", "B"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"),
-                  groupby(df, :Key1))
+        res = combine(d -> d.Key1 == ["B", "B"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"),
+                      groupby(df, :Key1))
         @test res.x isa Vector{String}
-        res = map(d -> d.Key1 == ["B", "B"] ? (x=d[1, :Key1],) : (x="C",),
-                  groupby(df, :Key1))
+        res = combine(d -> d.Key1 == ["B", "B"] ? (x=d[1, :Key1],) : (x="C",),
+                      groupby(df, :Key1))
         @test res.x isa Vector{String}
 
         df = DataFrame(x = [1, 2, 3], y = [2, 3, 1])
@@ -262,6 +262,53 @@ module TestGrouping
         @test res.x isa Vector{Int}
     end
 
+    @testset "map(f, ::GroupedDataFrame)" begin
+        Random.seed!(1)
+        df = DataFrame(a = repeat(Union{Int, Missing}[1, 3, 2, 4], outer=[2]),
+                       b = repeat(Union{Int, Missing}[2, 1], outer=[4]),
+                       c = Vector{Union{Float64, Missing}}(randn(8)))
+
+        cols = [:a, :b]
+        f(df) = DataFrame(cmax = maximum(df[:c]))
+        g(df) = (cmax = maximum(df[:c]),)
+        h(df) = maximum(df[:c])
+
+        res = unique(df[cols])
+        res.cmax = [maximum(df[(df.a .== a) .& (df.b .== b), :c])
+                    for (a, b) in zip(res.a, res.b)]
+        sres = sort(res)
+
+        # map() without groups sorting
+        gd = groupby(df, cols)
+        res = map(d -> d[[:c]], gd)
+        @test length(gd) == length(res)
+        @test res[1] == gd[1] && res[2] == gd[2] && res[3] == gd[3] && res[4] == gd[4]
+        res = map(f, groupby(df, cols))
+        @test vcat(res[1], res[2], res[3], res[4]) == by(f, df, cols)
+        res = map(g, groupby(df, cols))
+        @test vcat(res[1], res[2], res[3], res[4]) == by(g, df, cols)
+        res = map(h, groupby(df, cols))
+        @test vcat(res[1], res[2], res[3], res[4]) == by(df, cols, h)
+
+        # map() with groups sorting
+        gd = groupby(df, cols, sort=true)
+        res = map(d -> d[[:c]], gd)
+        @test length(gd) == length(res)
+        @test res[1] == gd[1] && res[2] == gd[2] && res[3] == gd[3] && res[4] == gd[4]
+        res = map(f, groupby(df, cols, sort=true))
+        @test vcat(res[1], res[2], res[3], res[4]) == by(f, df, cols, sort=true)
+        res = map(g, groupby(df, cols, sort=true))
+        @test vcat(res[1], res[2], res[3], res[4]) == by(g, df, cols, sort=true)
+        res = map(h, groupby(df, cols, sort=true))
+        @test vcat(res[1], res[2], res[3], res[4]) == by(df, cols, h, sort=true)
+
+        # Test with some groups returning empty data frames
+        df = DataFrame(x = [1, 2, 3], y = [2, 3, 1])
+        res = map(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), groupby(df, :x))
+        @test length(res) == 2
+        @test vcat(res[1], res[2]) == DataFrame(x=[2, 3], z=[1, 1])
+    end
+
     @testset "grouping with missings" begin
         global df = DataFrame(Key1 = ["A", missing, "B", "B", "A"],
                               Key2 = CategoricalArray(["B", "A", "A", missing, "A"]),