Make sorting of row groups optional (new `sort` keyword for `groupby`, `by` and `aggregate`)

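By default no sorting is applied: the groups keep the order in which their first rows appear in the original data frame, which preserves the original ordering and avoids the cost of sorting. Pass `sort = true` to get the groups sorted by the grouping columns, as before.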
JuliaData · Aug 30, 2015 · 8802e10 · 8802e10
1 parent f3d54c1
commit 8802e10
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 22 deletions.
diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl
@@ -37,6 +37,7 @@ groupby(cols)
 
 * `d` : an AbstractDataFrame
 * `cols` : data frame columns to group by
+* `sort` : whether to sort the row groups by the grouping columns (default `false`: groups keep the order in which they first appear in `d`)
 
 If `d` is not provided, a curried version of groupby is given.
 
@@ -81,20 +82,24 @@ df |> groupby([:a, :b]) |> [sum, length]
 ```
 
 """
-function groupby{T}(df::AbstractDataFrame, cols::Vector{T})
+function groupby{T}(df::AbstractDataFrame, cols::Vector{T}; sort::Bool = false)
     sdf = df[cols]
     df_groups = _group_rows(sdf)
     # sort the groups
-    group_perm = sortperm(sub(sdf, df_groups.rperm[df_groups.starts]))
+    if sort
+      group_perm = sortperm(sub(sdf, df_groups.rperm[df_groups.starts]))
+      permute!(df_groups.starts, group_perm)
+      permute!(df_groups.stops, group_perm)
+    end
     GroupedDataFrame(df, cols, df_groups.rperm,
-                     df_groups.starts[group_perm],
-                     df_groups.stops[group_perm])
+                     df_groups.starts,
+                     df_groups.stops)
 end
-groupby(d::AbstractDataFrame, cols) = groupby(d, [cols])
+groupby(d::AbstractDataFrame, cols; sort::Bool = false) = groupby(d, [cols], sort = sort)
 
 # add a function curry
-groupby{T}(cols::Vector{T}) = x -> groupby(x, cols)
-groupby(cols) = x -> groupby(x, cols)
+groupby{T}(cols::Vector{T}; sort::Bool = false) = x -> groupby(x, cols, sort = sort)
+groupby(cols; sort::Bool = false) = x -> groupby(x, cols, sort = sort)
 
 Base.start(gd::GroupedDataFrame) = 1
 Base.next(gd::GroupedDataFrame, state::Int) =
@@ -241,8 +246,8 @@ Split-apply-combine in one step; apply `f` to each grouping in `d`
 based on columns `col`
 
 ```julia
-by(d::AbstractDataFrame, cols, f::Function)
-by(f::Function, d::AbstractDataFrame, cols)
+by(d::AbstractDataFrame, cols, f::Function; sort::Bool = false)
+by(f::Function, d::AbstractDataFrame, cols; sort::Bool = false)
 ```
 
 ### Arguments
@@ -251,6 +256,7 @@ by(f::Function, d::AbstractDataFrame, cols)
 * `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.)
 * `f` : a function to be applied to groups; expects each argument to
   be an AbstractDataFrame
+* `sort` : whether to sort the row groups by the grouping columns (default `false`: groups keep the order in which they first appear in `d`)
 
 `f` can return a value, a vector, or a DataFrame. For a value or
 vector, these are merged into a column along with the `cols` keys. For
@@ -281,8 +287,10 @@ end
 ```
 
 """
-by(d::AbstractDataFrame, cols, f::Function) = combine(map(f, groupby(d, cols)))
-by(f::Function, d::AbstractDataFrame, cols) = by(d, cols, f)
+by(d::AbstractDataFrame, cols, f::Function; sort::Bool = false) =
+  combine(map(f, groupby(d, cols, sort = sort)))
+by(f::Function, d::AbstractDataFrame, cols; sort::Bool = false) =
+  by(d, cols, f, sort = sort)
 
 #
 # Aggregate convenience functions
@@ -342,8 +350,9 @@ Base.(:|>)(gd::GroupedDataFrame, fs::Vector{Function}) = aggregate(gd, fs)
 # Groups DataFrame by cols before applying aggregate
 function aggregate{T <: ColumnIndex}(d::AbstractDataFrame,
                                      cols::Union(T, AbstractVector{T}),
-                                     fs::Union(Function, Vector{Function}))
-    aggregate(groupby(d, cols), fs)
+                                     fs::Union(Function, Vector{Function});
+                                     sort::Bool = false)
+    aggregate(groupby(d, cols, sort = sort), fs)
 end
 
 function _makeheaders(fs::Vector{Function}, cn::Vector{Symbol})

diff --git a/test/data.jl b/test/data.jl
@@ -102,18 +102,16 @@ module TestData
     df8 = aggregate(df7[[1, 3]], sum)
     @test df8[1, :d1_sum] == sum(df7[:d1])
 
-    df8 = aggregate(df7, :d2, [sum, length])
+    df8 = aggregate(df7, :d2, [sum, length], sort=true)
     @test df8[1:2, :d2] == ["A", "B"]
     @test size(df8, 1) == 3
     @test size(df8, 2) == 5
     @test sum(df8[:d1_length]) == N
     @test all(df8[:d1_length] .> 0)
     @test df8[:d1_length] == [4, 5, 11]
-    @test isequal(df8, aggregate(groupby(df7, :d2), [sum, length]))
+    @test isequal(df8, aggregate(groupby(df7, :d2, sort=true), [sum, length]))
 
-    df9 = df7 |> groupby([:d2]) |> [sum, length]
-    @test isequal(df9, df8)
-    df9 = aggregate(df7, :d2, [sum, length])
+    df9 = df7 |> groupby([:d2], sort=true) |> [sum, length]
     @test isequal(df9, df8)
 
     df10 = DataFrame(

diff --git a/test/grouping.jl b/test/grouping.jl
@@ -2,29 +2,39 @@ module TestGrouping
     using Base.Test
     using DataFrames
 
-    df = DataFrame(a=rep(1:4, 2), b=rep(2:-1:1, 4), c=randn(8))
+    df = DataFrame(a=rep(4:-1:1, 2), b=rep(1:2, 4), c=randn(8))
     #df[6, :a] = NA
     #df[7, :b] = NA
 
     cols = [:a, :b]
 
     f(df) = DataFrame(cmax = maximum(df[:c]))
 
-    sdf = sort(df, cols=cols)
+    sdf = unique(df[cols])
+
+    # by() without groups sorting
     bdf = by(df, cols, f)
+    @test bdf[cols] == sdf
 
-    @test bdf[cols] == unique(sdf[cols])
+    # by() with groups sorting
+    sbdf = by(df, cols, f, sort=true)
+    @test sbdf[cols] == sort(sdf)
 
     byf = by(df, :a, df -> DataFrame(bsum = sum(df[:b])))
 
     @test all(T -> T <: AbstractVector, map(typeof, colwise([sum], df)))
     @test all(T -> T <: AbstractVector, map(typeof, colwise(sum, df)))
 
+    # groupby() without groups sorting
     gd = groupby(df, cols)
     ga = map(f, gd)
-
     @test bdf == combine(ga)
 
+    # groupby() with groups sorting
+    gd = groupby(df, cols, sort=true)
+    ga = map(f, gd)
+    @test sbdf == combine(ga)
+
     g(df) = DataFrame(cmax1 = df[:cmax] + 1)
     h(df) = g(f(df))