Skip to content

Commit

Permalink
make sorting of row groups optional
Browse files Browse the repository at this point in the history
By default no sorting is applied, which preserves the original ordering
(groups appear in the order of their first rows) and makes grouping faster.
  • Loading branch information
alyst committed Aug 30, 2015
1 parent f3d54c1 commit 8802e10
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 22 deletions.
35 changes: 22 additions & 13 deletions src/groupeddataframe/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ groupby(cols)
* `d` : an AbstractDataFrame
* `cols` : data frame columns to group by
* `sort` : whether to sort the row groups by the grouping columns (`false` by default, i.e. no sorting)
If `d` is not provided, a curried version of groupby is given.
Expand Down Expand Up @@ -81,20 +82,24 @@ df |> groupby([:a, :b]) |> [sum, length]
```
"""
function groupby{T}(df::AbstractDataFrame, cols::Vector{T})
function groupby{T}(df::AbstractDataFrame, cols::Vector{T}; sort::Bool = false)
sdf = df[cols]
df_groups = _group_rows(sdf)
# sort the groups by their key columns, if requested
group_perm = sortperm(sub(sdf, df_groups.rperm[df_groups.starts]))
if sort
group_perm = sortperm(sub(sdf, df_groups.rperm[df_groups.starts]))
permute!(df_groups.starts, group_perm)
permute!(df_groups.stops, group_perm)
end
GroupedDataFrame(df, cols, df_groups.rperm,
df_groups.starts[group_perm],
df_groups.stops[group_perm])
df_groups.starts,
df_groups.stops)
end
groupby(d::AbstractDataFrame, cols) = groupby(d, [cols])
groupby(d::AbstractDataFrame, cols; sort::Bool = false) = groupby(d, [cols], sort = sort)

# curried methods: return a function that groups its argument by `cols`
groupby{T}(cols::Vector{T}) = x -> groupby(x, cols)
groupby(cols) = x -> groupby(x, cols)
groupby{T}(cols::Vector{T}; sort::Bool = false) = x -> groupby(x, cols, sort = sort)
groupby(cols; sort::Bool = false) = x -> groupby(x, cols, sort = sort)

Base.start(gd::GroupedDataFrame) = 1
Base.next(gd::GroupedDataFrame, state::Int) =
Expand Down Expand Up @@ -241,8 +246,8 @@ Split-apply-combine in one step; apply `f` to each grouping in `d`
based on columns `cols`
```julia
by(d::AbstractDataFrame, cols, f::Function)
by(f::Function, d::AbstractDataFrame, cols)
by(d::AbstractDataFrame, cols, f::Function; sort::Bool = false)
by(f::Function, d::AbstractDataFrame, cols; sort::Bool = false)
```
### Arguments
Expand All @@ -251,6 +256,7 @@ by(f::Function, d::AbstractDataFrame, cols)
* `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.)
* `f` : a function to be applied to groups; expects each argument to
be an AbstractDataFrame
* `sort` : whether to sort the row groups by the grouping columns (`false` by default, i.e. no sorting)
`f` can return a value, a vector, or a DataFrame. For a value or
vector, these are merged into a column along with the `cols` keys. For
Expand Down Expand Up @@ -281,8 +287,10 @@ end
```
"""
by(d::AbstractDataFrame, cols, f::Function) = combine(map(f, groupby(d, cols)))
by(f::Function, d::AbstractDataFrame, cols) = by(d, cols, f)
by(d::AbstractDataFrame, cols, f::Function; sort::Bool = false) =
combine(map(f, groupby(d, cols, sort = sort)))
by(f::Function, d::AbstractDataFrame, cols; sort::Bool = false) =
by(d, cols, f, sort = sort)

#
# Aggregate convenience functions
Expand Down Expand Up @@ -342,8 +350,9 @@ Base.(:|>)(gd::GroupedDataFrame, fs::Vector{Function}) = aggregate(gd, fs)
# Groups DataFrame by cols before applying aggregate
function aggregate{T <: ColumnIndex}(d::AbstractDataFrame,
cols::Union(T, AbstractVector{T}),
fs::Union(Function, Vector{Function}))
aggregate(groupby(d, cols), fs)
fs::Union(Function, Vector{Function});
sort::Bool = false)
aggregate(groupby(d, cols, sort = sort), fs)
end

function _makeheaders(fs::Vector{Function}, cn::Vector{Symbol})
Expand Down
8 changes: 3 additions & 5 deletions test/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -102,18 +102,16 @@ module TestData
df8 = aggregate(df7[[1, 3]], sum)
@test df8[1, :d1_sum] == sum(df7[:d1])

df8 = aggregate(df7, :d2, [sum, length])
df8 = aggregate(df7, :d2, [sum, length], sort=true)
@test df8[1:2, :d2] == ["A", "B"]
@test size(df8, 1) == 3
@test size(df8, 2) == 5
@test sum(df8[:d1_length]) == N
@test all(df8[:d1_length] .> 0)
@test df8[:d1_length] == [4, 5, 11]
@test isequal(df8, aggregate(groupby(df7, :d2), [sum, length]))
@test isequal(df8, aggregate(groupby(df7, :d2, sort=true), [sum, length]))

df9 = df7 |> groupby([:d2]) |> [sum, length]
@test isequal(df9, df8)
df9 = aggregate(df7, :d2, [sum, length])
df9 = df7 |> groupby([:d2], sort=true) |> [sum, length]
@test isequal(df9, df8)

df10 = DataFrame(
Expand Down
18 changes: 14 additions & 4 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,39 @@ module TestGrouping
using Base.Test
using DataFrames

df = DataFrame(a=rep(1:4, 2), b=rep(2:-1:1, 4), c=randn(8))
df = DataFrame(a=rep(4:-1:1, 2), b=rep(1:2, 4), c=randn(8))
#df[6, :a] = NA
#df[7, :b] = NA

cols = [:a, :b]

f(df) = DataFrame(cmax = maximum(df[:c]))

sdf = sort(df, cols=cols)
sdf = unique(df[cols])

# by() without groups sorting
bdf = by(df, cols, f)
@test bdf[cols] == sdf

@test bdf[cols] == unique(sdf[cols])
# by() with groups sorting
sbdf = by(df, cols, f, sort=true)
@test sbdf[cols] == sort(sdf)

byf = by(df, :a, df -> DataFrame(bsum = sum(df[:b])))

@test all(T -> T <: AbstractVector, map(typeof, colwise([sum], df)))
@test all(T -> T <: AbstractVector, map(typeof, colwise(sum, df)))

# groupby() without groups sorting
gd = groupby(df, cols)
ga = map(f, gd)

@test bdf == combine(ga)

# groupby() with groups sorting
gd = groupby(df, cols, sort=true)
ga = map(f, gd)
@test sbdf == combine(ga)

g(df) = DataFrame(cmax1 = df[:cmax] + 1)
h(df) = g(f(df))

Expand Down

0 comments on commit 8802e10

Please sign in to comment.