diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl index 8d7b6d6b7c..1c10ffcb86 100644 --- a/src/groupeddataframe/grouping.jl +++ b/src/groupeddataframe/grouping.jl @@ -36,7 +36,7 @@ groupby(cols) ### Arguments * `d` : an AbstractDataFrame -* `cols` : an +* `cols` : data frame columns to group by If `d` is not provided, a curried version of groupby is given. @@ -82,32 +82,19 @@ df |> groupby([:a, :b]) |> [sum, length] """ function groupby{T}(d::AbstractDataFrame, cols::Vector{T}) - ## a subset of Wes McKinney's algorithm here: - ## http://wesmckinney.com/blog/?p=489 - ncols = length(cols) - # use the pool trick to get a set of integer references for each unique item - dv = PooledDataArray(d[cols[ncols]]) - # if there are NAs, add 1 to the refs to avoid underflows in x later - dv_has_nas = (findfirst(dv.refs, 0) > 0 ? 1 : 0) - x = copy(dv.refs) .+ dv_has_nas - # also compute the number of groups, which is the product of the set lengths - ngroups = length(dv.pool) + dv_has_nas - # if there's more than 1 column, do roughly the same thing repeatedly - for j = (ncols - 1):-1:1 - dv = PooledDataArray(d[cols[j]]) - dv_has_nas = (findfirst(dv.refs, 0) > 0 ? 1 : 0) - for i = 1:nrow(d) - x[i] += (dv.refs[i] + dv_has_nas- 1) * ngroups - end - ngroups = ngroups * (length(dv.pool) + dv_has_nas) - # TODO if ngroups is really big, shrink it + d_groups = _group_rows(d[cols]) + # sort the groups + d_group_keys = sort!(collect(keys(d_groups))) + # generate permutation that arranges rows by groups + idx = sizehint!(@compat(Vector{Int}()), nrow(d)) + starts = sizehint!(@compat(Vector{Int}()), length(d_groups)) + for gr_row in d_group_keys + push!(starts, length(idx)+1) + append!(idx, d_groups[gr_row]) end - (idx, starts) = DataArrays.groupsort_indexer(x, ngroups) - # Remove zero-length groupings - starts = _uniqueofsorted(starts) - ends = starts[2:end] - 1 - GroupedDataFrame(d, cols, idx, starts[1:end-1], ends) + ends = push!(starts[2:end] - 1, length(idx)) + GroupedDataFrame(d, cols, idx, starts, ends) end groupby(d::AbstractDataFrame, cols) = groupby(d, [cols]) @@ -284,7 +271,7 @@ notation can be used. ### Returns -* `::DataFrame` +* `::DataFrame` ### Examples @@ -330,7 +317,7 @@ same length. ### Returns -* `::DataFrame` +* `::DataFrame` ### Examples diff --git a/src/other/utils.jl b/src/other/utils.jl index 696a62191f..7391315008 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -162,19 +162,6 @@ function _setdiff{T}(a::AbstractVector{T}, b::T) diff end -function _uniqueofsorted(x::Vector) - idx = fill(true, length(x)) - lastx = x[1] - for i = 2:length(x) - if lastx == x[i] - idx[i] = false - else - lastx = x[i] - end - end - x[idx] -end - # Gets the name of a function. Used in groupedataframe/grouping.jl function _fnames(fs::Vector{Function}) λcounter = 0 @@ -188,4 +175,4 @@ function _fnames(fs::Vector{Function}) name end names -end \ No newline at end of file +end