Skip to content

Commit

Permalink
Reinstate combine(), change map() to return a GroupedDataFrame
Browse files Browse the repository at this point in the history
  • Loading branch information
nalimilan committed Sep 29, 2018
1 parent 69ae02b commit 256bd1d
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 40 deletions.
3 changes: 2 additions & 1 deletion docs/src/lib/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ Pages = ["functions.md"]
aggregate
by
colwise
combine
groupby
join
map
melt
stack
unstack
Expand All @@ -27,7 +29,6 @@ meltdf

```@docs
allowmissing!
map
completecases
describe
disallowmissing!
Expand Down
1 change: 1 addition & 0 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ export AbstractDataFrame,
by,
categorical!,
colwise,
combine,
completecases,
deleterows!,
describe,
Expand Down
3 changes: 0 additions & 3 deletions src/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1382,6 +1382,3 @@ end

import Base: map
@deprecate map(f::Function, sdf::SubDataFrame) f(sdf)

@deprecate combine(f::Function, gd::GroupedDataFrame) map(f, gd)
@deprecate combine(gd::GroupedDataFrame) map(identity, gd)
95 changes: 79 additions & 16 deletions src/groupeddataframe/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,17 @@ groupby(cols; sort = false, skipmissing = false)
An iterator over a `GroupedDataFrame` returns a `SubDataFrame` view
for each grouping into `d`. A `GroupedDataFrame` also supports
indexing by groups and `map` (which applies a function to each group
indexing by groups, `map` (which applies a function to each group)
and `combine` (which applies a function to each group
and combines the result into a data frame).
See the following for additional split-apply-combine operations:
* `by` : split-apply-combine using functions
* `aggregate` : split-apply-combine; applies functions in the form of a cross product
* `colwise` : apply a function to each column in an AbstractDataFrame or GroupedDataFrame
* `colwise` : apply a function to each column in an `AbstractDataFrame` or `GroupedDataFrame`
* `map` : apply a function to each group of a `GroupedDataFrame` (without combining)
* `combine` : combine a `GroupedDataFrame`, optionally applying a function to each group
### Examples
Expand All @@ -73,6 +76,7 @@ for g in gd
println(g)
end
map(d -> mean(skipmissing(d[:c])), gd)
combine(d -> mean(skipmissing(d[:c])), gd)
```
"""
Expand Down Expand Up @@ -121,23 +125,77 @@ wrap(s::Union{AbstractVector, Tuple}) = DataFrame(x1 = s)
wrap(s::Any) = (x1 = s,)

"""
Apply a function to each group of rows and combine the result
map(f::Function, gd::GroupedDataFrame)
Apply a function to each group of rows and return a `GroupedDataFrame`.
For each group in `gd`, `f` is passed a `SubDataFrame` view holding the corresponding rows.
`f` can return a single value, a named tuple, a vector, or a data frame.
This determines the shape of the resulting data frame:
- A single value gives a data frame with a single column and one row per group.
- A named tuple gives a data frame with one column for each field in the named tuple
and one row per group.
- A vector gives a data frame with a single column and as many rows
for each group as the length of the returned vector for that group.
- A data frame gives a data frame with the same columns and as many rows
for each group as the rows returned for that group.
In all cases, the resulting `GroupedDataFrame` contains all the grouping columns in addition
to those listed above. Note that `f` must always return the same type of object for
all groups and, if it returns a named tuple or a data frame, the same fields or columns.
Returning a single value or a named tuple is significantly faster than
returning a vector or a data frame.
### Examples
```julia
map(f::Function, gd::GroupedDataFrame)
df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]),
b = repeat([2, 1], outer=[4]),
c = randn(8))
gd = groupby(df, :a)
map(d -> sum(skipmissing(d[:c])), gd)
```
### Arguments
### See also
* `gd` : a `GroupedDataFrame` object
`combine(f, gd)` returns a `DataFrame` rather than a `GroupedDataFrame`
### Returns
"""
function Base.map(f::Function, gd::GroupedDataFrame)
    # Apply `f` to every group of `gd` and return the results as a new
    # `GroupedDataFrame` whose parent holds the grouping columns followed by
    # the columns produced by `f`.
    #
    # NOTE(review): `gd.cols` is reused as-is for the new parent, in which the
    # grouping columns come first. If `gd.cols` can hold integer positions
    # rather than column names this may designate the wrong columns -- confirm.
    if length(gd) > 0
        # `idx` has one entry per row of the result; a change of value between
        # two consecutive entries marks the start of a new group below.
        idx, valscat = _combine(wrap(f(gd[1])), f, gd)
        parent = hcat!(gd.parent[idx, gd.cols], valscat)
        nrows = length(idx)
        # `f` returned no rows for any group: the result must contain no group.
        # Without this guard a spurious empty group (start 1, end 0) is created.
        if nrows == 0
            return GroupedDataFrame(parent, gd.cols, Int[], Int[], Int[])
        end
        # There can be at most `length(gd)` groups in the result
        starts = Vector{Int}(undef, length(gd))
        ends = Vector{Int}(undef, length(gd))
        starts[1] = 1
        j = 2
        @inbounds for i in 2:nrows
            if idx[i] != idx[i-1]
                starts[j] = i
                ends[j-1] = i - 1
                j += 1
            end
        end
        # In case some groups have to be dropped
        resize!(starts, j-1)
        resize!(ends, j-1)
        ends[end] = nrows
        # Rows of the new parent are already in group order
        return GroupedDataFrame(parent, gd.cols, collect(1:nrows), starts, ends)
    else
        return GroupedDataFrame(similar(gd.parent[gd.cols], 0), gd.cols,
                                Int[], Int[], Int[])
    end
end

* `::DataFrame`
"""
combine(gd::GroupedDataFrame)
combine(f::Function, gd::GroupedDataFrame)
### Details
Transform a `GroupedDataFrame` into a `DataFrame`.
If a function `f` is provided, it is called for each group in `gd` with a `SubDataFrame` view
holding the corresponding rows, and the returned `DataFrame` then consists of the returned rows
plus the grouping columns.
For each group in `gd`, `f` is passed a `SubDataFrame` view with the corresponding rows.
`f` can return a single value, a named tuple, a vector, or a data frame.
This determines the shape of the resulting data frame:
- A single value gives a data frame with a single column and one row per group.
Expand Down Expand Up @@ -169,18 +227,23 @@ map(d -> sum(skipmissing(d[:c])), gd)
### See also
`by(f, df, cols)` is a shorthand for `map(f, groupby(df, cols))`.
[`by(f, df, cols)`](@ref) is a shorthand for `combine(f, groupby(df, cols))`.
[`map`](@ref): `combine(f, groupby(df, cols))` is a more efficient equivalent
of `combine(map(f, groupby(df, cols)))`.
"""
function Base.map(f::Function, gd::GroupedDataFrame)
function combine(f::Function, gd::GroupedDataFrame)
if length(gd) > 0
idx, valscat = _combine(wrap(f(gd[1])), f, gd)
return hcat!(gd.parent[idx, gd.cols], valscat)
else
return similar(gd.parent, 0)[gd.cols]
return similar(gd.parent[gd.cols], 0)
end
end

# Form without a function: combine the groups of `gd` as they are by
# forwarding to the two-argument method with `identity`.
function combine(gd::GroupedDataFrame)
    return combine(identity, gd)
end

function _combine(first::NamedTuple, f::Function, gd::GroupedDataFrame)
m = length(first)
n = length(gd)
Expand Down Expand Up @@ -413,7 +476,7 @@ returning a vector or a data frame.
A method is defined with `f` as the first argument, so do-block
notation can be used.
`by(d, cols, f)` is equivalent to `map(f, groupby(d, cols))`.
`by(d, cols, f)` is equivalent to `combine(f, groupby(d, cols))`.
### Returns
Expand All @@ -436,7 +499,7 @@ end
"""
by(d::AbstractDataFrame, cols, f::Function; sort::Bool = false) =
map(f, groupby(d, cols, sort = sort))
combine(f, groupby(d, cols, sort = sort))
# Function-first method, so that `by` can be called with do-block notation.
# Simply forwards to the `by(d, cols, f)` method.
function by(f::Function, d::AbstractDataFrame, cols; sort::Bool = false)
    return by(d, cols, f, sort = sort)
end

Expand Down Expand Up @@ -491,7 +554,7 @@ end
# Single-function convenience method: wraps `f` in a one-element vector and
# forwards to the method taking a vector of functions.
function aggregate(gd::GroupedDataFrame, f::Function; sort::Bool=false)
    return aggregate(gd, [f], sort=sort)
end
function aggregate(gd::GroupedDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function
headers = _makeheaders(fs, setdiff(_names(gd), _names(gd.parent[gd.cols])))
res = map(x -> _aggregate(without(x, gd.cols), fs, headers), gd)
res = combine(x -> _aggregate(without(x, gd.cols), fs, headers), gd)
sort && sort!(res, headers)
res
end
Expand Down
87 changes: 67 additions & 20 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ module TestGrouping
sres = sort(res)

# by() without groups sorting
@test sort(by(df, cols, identity, sort=true)[[:a, :b, :c]]) == sort(df)
@test sort(by(df, cols, identity)[[:a, :b, :c]]) == sort(df)
@test by(df, cols, f) == res
@test by(df, cols, g) == res
@test rename(by(df, cols, h), :x1 => :cmax) == res
Expand All @@ -118,21 +118,21 @@ module TestGrouping

# groupby() without groups sorting
gd = groupby(df, cols)
@test sort(map(identity, gd)[[:a, :b, :c]]) == sort(df)
@test map(f, gd) == res
@test map(g, gd) == res
@test rename(map(h, gd), :x1 => :cmax) == res
@test sort(combine(identity, gd)[[:a, :b, :c]]) == sort(df)
@test combine(f, gd) == res
@test combine(g, gd) == res
@test rename(combine(h, gd), :x1 => :cmax) == res

# groupby() with groups sorting
gd = groupby(df, cols, sort=true)
for i in 1:length(gd)
@test all(gd[i].a .== sres.a[i])
@test all(gd[i].b .== sres.b[i])
end
@test map(identity, gd)[[:a, :b, :c]] == sort(df, cols)
@test map(f, gd) == sres
@test map(g, gd) == sres
@test rename(map(h, gd), :x1 => :cmax) == sres
@test combine(identity, gd)[[:a, :b, :c]] == sort(df, cols)
@test combine(f, gd) == sres
@test combine(g, gd) == sres
@test rename(combine(h, gd), :x1 => :cmax) == sres

# testing pool overflow
df2 = DataFrame(v1 = categorical(collect(1:1000)), v2 = categorical(fill(1, 1000)))
Expand Down Expand Up @@ -178,27 +178,27 @@ module TestGrouping
@test gd[4] == DataFrame(Key1="B", Key2="B", Value=4)

# Check that CategoricalArray column is preserved when returning a value...
res = map(d -> DataFrame(x=d[1, :Key2]), groupby(df, :Key1))
res = combine(d -> DataFrame(x=d[1, :Key2]), groupby(df, :Key1))
@test res.x isa CategoricalVector{String}
res = map(d -> (x=d[1, :Key2],), groupby(df, :Key1))
res = combine(d -> (x=d[1, :Key2],), groupby(df, :Key1))
@test res.x isa CategoricalVector{String}
# ...and when returning an array
res = map(d -> DataFrame(x=d[:Key1]), groupby(df, :Key1))
res = combine(d -> DataFrame(x=d[:Key1]), groupby(df, :Key1))
@test res.x isa CategoricalVector{String}

# Check that CategoricalArray and String give a String...
res = map(d -> d.Key1 == ["A", "A"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"),
groupby(df, :Key1))
res = combine(d -> d.Key1 == ["A", "A"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"),
groupby(df, :Key1))
@test res.x isa Vector{String}
res = map(d -> d.Key1 == ["A", "A"] ? (x=d[1, :Key1],) : (x="C",),
groupby(df, :Key1))
res = combine(d -> d.Key1 == ["A", "A"] ? (x=d[1, :Key1],) : (x="C",),
groupby(df, :Key1))
@test res.x isa Vector{String}
# ...even when CategoricalString comes second
res = map(d -> d.Key1 == ["B", "B"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"),
groupby(df, :Key1))
res = combine(d -> d.Key1 == ["B", "B"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"),
groupby(df, :Key1))
@test res.x isa Vector{String}
res = map(d -> d.Key1 == ["B", "B"] ? (x=d[1, :Key1],) : (x="C",),
groupby(df, :Key1))
res = combine(d -> d.Key1 == ["B", "B"] ? (x=d[1, :Key1],) : (x="C",),
groupby(df, :Key1))
@test res.x isa Vector{String}

df = DataFrame(x = [1, 2, 3], y = [2, 3, 1])
Expand Down Expand Up @@ -262,6 +262,53 @@ module TestGrouping
@test res.x isa Vector{Int}
end

@testset "map(f, ::GroupedDataFrame)" begin
    Random.seed!(1)
    df = DataFrame(a = repeat(Union{Int, Missing}[1, 3, 2, 4], outer=[2]),
                   b = repeat(Union{Int, Missing}[2, 1], outer=[4]),
                   c = Vector{Union{Float64, Missing}}(randn(8)))

    cols = [:a, :b]
    # The same reduction returned as a data frame, a named tuple and a single value
    f(df) = DataFrame(cmax = maximum(df[:c]))
    g(df) = (cmax = maximum(df[:c]),)
    h(df) = maximum(df[:c])

    # Concatenate all the groups of a GroupedDataFrame into a single data frame
    allgroups(gd) = vcat((gd[i] for i in 1:length(gd))...)

    # map() without, then with groups sorting
    for sorted in (false, true)
        gd = groupby(df, cols, sort=sorted)

        # Returning the non-grouping column must give back the original groups
        res = map(d -> d[[:c]], gd)
        @test length(gd) == length(res)
        @test length(res) == 4
        for i in 1:length(gd)
            @test res[i] == gd[i]
        end

        # Stacking the groups returned by map() must match what by() returns
        res = map(f, groupby(df, cols, sort=sorted))
        @test allgroups(res) == by(f, df, cols, sort=sorted)
        res = map(g, groupby(df, cols, sort=sorted))
        @test allgroups(res) == by(g, df, cols, sort=sorted)
        res = map(h, groupby(df, cols, sort=sorted))
        @test allgroups(res) == by(df, cols, h, sort=sorted)
    end

    # Test with some groups returning empty data frames
    df = DataFrame(x = [1, 2, 3], y = [2, 3, 1])
    res = map(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), groupby(df, :x))
    @test length(res) == 2
    @test vcat(res[1], res[2]) == DataFrame(x=[2, 3], z=[1, 1])
end

@testset "grouping with missings" begin
global df = DataFrame(Key1 = ["A", missing, "B", "B", "A"],
Key2 = CategoricalArray(["B", "A", "A", missing, "A"]),
Expand Down

0 comments on commit 256bd1d

Please sign in to comment.