Skip to content

Commit

Permalink
Add pivot_table plus various other fixes related to reshaping (issue #29).
Browse files Browse the repository at this point in the history
  • Loading branch information
powerdistribution committed Jan 24, 2013
1 parent a74a47c commit bb5b59b
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 33 deletions.
3 changes: 2 additions & 1 deletion src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,8 @@ export # reconcile_groups,
RepeatedVector,
EachRepeatedVector,
melt,
melt_df
melt_df,
pivot_table

##############################################################################
##
Expand Down
8 changes: 4 additions & 4 deletions src/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1766,18 +1766,18 @@ end

# Iteration by rows
type DFRowIterator
df::DataFrame
df::AbstractDataFrame
end
EachRow(df::DataFrame) = DFRowIterator(df)
EachRow(df::AbstractDataFrame) = DFRowIterator(df)
start(itr::DFRowIterator) = 1
done(itr::DFRowIterator, i::Int) = i > nrow(itr.df)
next(itr::DFRowIterator, i::Int) = (itr.df[i, :], i + 1)

# Iteration by columns
type DFColumnIterator
df::DataFrame
df::AbstractDataFrame
end
EachCol(df::DataFrame) = DFColumnIterator(df)
EachCol(df::AbstractDataFrame) = DFColumnIterator(df)
start(itr::DFColumnIterator) = 1
done(itr::DFColumnIterator, j::Int) = j > ncol(itr.df)
next(itr::DFColumnIterator, j::Int) = (itr.df[:, j], j + 1)
Expand Down
25 changes: 17 additions & 8 deletions src/extras.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@ const LETTERS = convert(Vector{ASCIIString}, split("ABCDEFGHIJKLMNOPQRSTUVWXYZ",
# Like string(s), but preserves Vector{String} and converts
# Vector{Any} to Vector{String}.
_vstring{T <: String}(s::T) = s
_vstring{T <: String}(s::Vector{T}) = s
_vstring(s::Vector) = map(_vstring, s)
_vstring{T <: String}(s::AbstractVector{T}) = s
_vstring(s::AbstractVector) = String[_vstring(x) for x in s]
_vstring(s::Any) = string(s)

function paste{T<:String}(s::Vector{T}...)
function paste(s...)
s = map(vcat * _vstring, {s...})
sa = {s...}
N = max(length, sa)
res = fill("", N)
Expand All @@ -50,12 +51,20 @@ function paste{T<:String}(s::Vector{T}...)
end
res
end
# The following converts all arguments to Vector{<:String} before
# calling paste.
function paste(s...)
converted = map(vcat * _vstring, {s...})
paste(converted...)

# Concatenate the columns of `d` row-wise into a Vector of strings, with
# `sep` inserted between adjacent column values (but not after the last).
function paste_columns(d::AbstractDataFrame, sep)
nr = nrow(d)
nc = ncol(d)
out = fill("", nr)
for i in 1:nr
for j in 1:nc
out[i] *= string(d[i,j])
if j != nc
out[i] *= sep
end
end
end
out
end
# Default separator is an underscore, e.g. "a_b_c".
paste_columns(d::AbstractDataFrame) = paste_columns(d, "_")

##############################################################################
##
Expand Down
2 changes: 1 addition & 1 deletion src/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Split - Apply - Combine operations
#

function groupsort_indexer(x::Vector, ngroups::Integer)
function groupsort_indexer(x::AbstractVector, ngroups::Int)
## translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).

## count group sizes, location 0 for NA
Expand Down
2 changes: 1 addition & 1 deletion src/pooleddataarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ end

# Need assign()'s to make this work
function show(io::IO, pda::PooledDataArray)
invoke(show, (Any, AbstractArray), io, pda)
invoke(show, (IO, AbstractArray), io, pda)
print(io, "\nlevels: ")
print(io, levels(pda))
end
Expand Down
92 changes: 80 additions & 12 deletions src/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,44 @@
##
## Reshaping
##
## Also, see issue # ??
##
##############################################################################

##############################################################################
##
## stack()
## melt()
##
##############################################################################

function stack(df::DataFrame, measure_vars::Vector{Int}, id_vars::Vector{Int})
remainingcols = _setdiff([1:ncol(df)], measure_vars)
res = rbind([insert!(df[[i, id_vars]], 1, colnames(df)[i], "key") for i in measure_vars]...)
res = rbind([insert!(df[[i, id_vars]], 1, colnames(df)[i], "variable") for i in measure_vars]...)
replace_names!(res, colnames(res)[2], "value")
res
end
stack(df::DataFrame, measure_vars, id_vars) = stack(df, [df.colindex[measure_vars]], [df.colindex[id_vars]])
stack(df::DataFrame, measure_vars) = stack(df, [df.colindex[measure_vars]], _setdiff([1:ncol(df)], [df.colindex[measure_vars]]))

melt(df::DataFrame, id_vars) = stack(df, _setdiff([1:ncol(df)], df.colindex[id_vars]))
melt(df::DataFrame, id_vars) = stack(df, _setdiff([1:ncol(df)], [df.colindex[id_vars]]))
melt(df::DataFrame, id_vars, measure_vars) = stack(df, measure_vars, id_vars)

function unstack(df::DataFrame, ikey::Int, ivalue::Int, irefkey::Int)
keycol = PooledDataArray(df[ikey])
valuecol = df[ivalue]
##############################################################################
##
## unstack()
##
##############################################################################

function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
# `rowkey` integer indicating which column to place along rows
# `colkey` integer indicating which column to place along column headers
# `value` integer indicating which column has values
keycol = PooledDataArray(df[rowkey])
valuecol = df[value]
# TODO make a version with a default refkeycol
refkeycol = PooledDataArray(df[irefkey])
remainingcols = _setdiff([1:ncol(df)], [ikey, ivalue])
refkeycol = PooledDataArray(df[colkey])
remainingcols = _setdiff([1:ncol(df)], [rowkey, value])
Nrow = length(refkeycol.pool)
Ncol = length(keycol.pool)
# TODO make fillNA(type, length)
Expand All @@ -38,10 +56,50 @@ function unstack(df::DataFrame, ikey::Int, ivalue::Int, irefkey::Int)
payload[j][i] = valuecol[k]
end
end
insert!(payload, 1, refkeycol.pool, colnames(df)[irefkey])
insert!(payload, 1, refkeycol.pool, colnames(df)[colkey])
end
unstack(df::AbstractDataFrame, rowkey, value, colkey) =
unstack(df, index(df)[rowkey], index(df)[value], index(df)[colkey])

##############################################################################
##
## pivot_table()
##
##############################################################################

# Limitations:
# - only one `value` column is allowed (same as dcast)
# - `fun` must reduce to one value
# - no margins
# - can't have zero rows or zero columns
# - the resulting data part is Float64 (`payload` below)

# Reshape `df` into a spreadsheet-style pivot table: unique combinations of
# the `rows` key columns become rows, unique combinations of the `cols` key
# columns become column headers, and each cell holds `fun` applied to the
# matching `value` entries.
function pivot_table(df::AbstractDataFrame, rows::Vector{Int}, cols::Vector{Int}, value::Int, fun::Function)
# `rows` vector indicating which columns are keys placed in rows
# `cols` vector indicating which columns are keys placed as column headers
# `value` integer indicating which column has values
# `fun` function applied to the value column during aggregation
# Aggregate once per (row-key, col-key) combination; the aggregate lands
# in the auto-named column "x1" of `cmb_df`.
cmb_df = by(df, [rows, cols], d->fun(d[value]))
# Collapse the multi-column keys into single string keys via paste_columns;
# the pooled refs then serve as integer row/column positions.
row_pdv = PooledDataArray(paste_columns(cmb_df[[length(rows):-1:1]])) # the :-1: is to reverse the columns for sorting
row_idxs = int(row_pdv.refs)
Nrow = length(row_pdv.pool)
col_pdv = PooledDataArray(paste_columns(cmb_df[[length(rows) + (1:length(cols))]]))
col_idxs = int(col_pdv.refs)
Ncol = length(col_pdv.pool)
# `payload` is the main "data holding" part of the resulting DataFrame
payload = DataFrame(Nrow, Ncol)
colnames!(payload, col_pdv.pool)
# Scatter each aggregated value into its (row, column) cell; combinations
# absent from `cmb_df` are left as the DataFrame default (NA).
for i in 1:length(row_idxs)
payload[row_idxs[i], col_idxs[i]] = cmb_df[i,"x1"]
end
# find the "row" key DataFrame
g = groupby(cmb_df[[1:length(rows)]], [1:length(rows)])
row_key_df = g.parent[g.idx[g.starts],:]
# Final result: the row-key columns on the left, data cells on the right.
cbind(row_key_df, payload)
end
unstack(df::DataFrame, ikey, ivalue, irefkey) =
unstack(df, df.colindex[ikey], df.colindex[ivalue], df.colindex[irefkey])
# `mean` is the default aggregation function:
pivot_table(df::AbstractDataFrame, rows, cols, value) = pivot_table(df, rows, cols, value, mean)
# Resolve column names (or other indexables) to integer indices via the
# frame's index, then dispatch to the Vector{Int} core method above.
pivot_table(df::AbstractDataFrame, rows, cols, value, fun) = pivot_table(df, [index(df)[rows]], [index(df)[cols]], index(df)[value], fun)


##############################################################################
Expand Down Expand Up @@ -102,6 +160,8 @@ eltype(v::StackedVector) = eltype(v.components[1])
show(io::IO, v::StackedVector) = internal_show_vector(io, v)
repl_show(io::IO, v::StackedVector) = internal_repl_show_vector(io, v)

# Materialize the lazy StackedVector (via full indexing) and pool it.
PooledDataArray(v::StackedVector) = PooledDataArray(v[:])

function ref{T,I<:Real}(v::RepeatedVector{T},i::AbstractVector{I})
j = mod(i - 1, length(v.parent)) + 1
v.parent[j]
Expand All @@ -120,6 +180,9 @@ eltype{T}(v::RepeatedVector{T}) = T
show(io::IO, v::RepeatedVector) = internal_show_vector(io, v)
repl_show(io::IO, v::RepeatedVector) = internal_repl_show_vector(io, v)

# The distinct values of a RepeatedVector are exactly those of its parent.
unique(v::RepeatedVector) = unique(v.parent)

# Pool the parent once, then repeat it v.n times by splat-concatenating the
# copies. NOTE(review): relies on `[xs...]` vcat-ing arrays in this Julia
# version and appears to yield a plain concatenation rather than a single
# PooledDataArray — confirm intended return type against callers.
PooledDataArray(v::RepeatedVector) = [fill(PooledDataArray(v.parent), v.n)...]

function ref{T}(v::EachRepeatedVector{T},i::Real)
j = div(i - 1, v.n) + 1
Expand All @@ -139,6 +202,10 @@ eltype{T}(v::EachRepeatedVector{T}) = T
show(io::IO, v::EachRepeatedVector) = internal_show_vector(io, v)
repl_show(io::IO, v::EachRepeatedVector) = internal_repl_show_vector(io, v)

# The distinct values of an EachRepeatedVector are exactly those of its parent.
unique(v::EachRepeatedVector) = unique(v.parent)

# Materialize the lazy vector and pool it, using the parent's unique values
# as the pool. Fix: the original referenced an undefined variable `a`
# (`PooledDataArray(a[:], ...)`); the materialized values come from `v[:]`,
# mirroring the StackedVector method above.
PooledDataArray(v::EachRepeatedVector) = PooledDataArray(v[:], unique(v.parent))


# The default values of show and repl_show don't work because
# both try to reshape the vector into a matrix, and these
Expand All @@ -165,6 +232,7 @@ end
##############################################################################
##
## stack_df()
## melt_df()
## Reshaping using referencing (issue #145), using the above vector types
##
##############################################################################
Expand All @@ -176,8 +244,8 @@ function stack_df(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vec
remainingcols = _setdiff([1:ncol(df)], measure_vars)
names = colnames(df)[id_vars]
insert!(names, 1, "value")
insert!(names, 1, "key")
DataFrame({EachRepeatedVector(colnames(df)[measure_vars], nrow(df)), # key
insert!(names, 1, "variable")
DataFrame({EachRepeatedVector(colnames(df)[measure_vars], nrow(df)), # variable
StackedVector({df[:,c] for c in 1:N}), # value
## RepeatedVector([1:nrow(df)], N), # idx - do we want this?
[RepeatedVector(df[:,c], N) for c in id_vars]...}, # id_var columns
Expand Down
27 changes: 21 additions & 6 deletions test/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -390,22 +390,37 @@ d1s = stack(d1, ["a", "b"])
d1s2 = stack(d1, ["c", "d"])
@assert isequal(d1s[1:12, "c"], d1["c"])
@assert isequal(d1s[13:24, "c"], d1["c"])
@assert all(colnames(d1s) .== ["key", "value", "c", "d"])
@assert all(colnames(d1s) .== ["variable", "value", "c", "d"])
d1s_df = stack_df(d1, ["a", "b"])
@assert isequal(d1s["key"], d1s_df["key"][:])
@assert isequal(d1s["variable"], d1s_df["variable"][:])
@assert isequal(d1s["value"], d1s_df["value"][:])
@assert isequal(d1s["c"], d1s_df["c"][:])
@assert isequal(d1s[1,:], d1s_df[1,:])



d1s["idx"] = [1:12, 1:12]
d1s2["idx"] = [1:12, 1:12]
d1us = unstack(d1s, "key", "value", "idx")
d1us2 = unstack(d1s2, "key", "value", "idx")
d1us = unstack(d1s, "variable", "idx", "value")
d1us2 = unstack(d1s2, "variable", "idx", "value")
@assert isequal(d1us["a"], d1["a"])
@assert isequal(d1us2["d"], d1["d"])

# Fixture for pivot_table: columns a/b/c are string keys, d holds numeric
# values (pi * 1:14). `letters`/`LETTERS` are the lowercase/uppercase
# alphabet vectors exported by this package.
d = DataFrame(quote
a = letters[5:8]
b = LETTERS[10:11]
c = LETTERS[13 + [1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2]]
d = pi * [1:14]
end)

# Pivot with two row keys and one column key; default aggregation is mean.
# Cell (1, "O") holds a single observation, so it equals that raw value.
dpv = pivot_table(d, ["a", "b"], "c", "d")
@assert( dpv[1,"O"] == d[5,"d"])
@assert( nrow(dpv) == 4 )

# Two column keys: headers are pasted with "_" (see paste_columns), e.g. "O_J".
dpv2 = pivot_table(d, ["a"], ["c", "b"], "d")
@assert( dpv2[1,"O_J"] == d[5,"d"])

# Explicit aggregation function: `length` counts observations per cell.
dpv3 = pivot_table(d, ["a"], ["c", "b"], "d", length)
@assert( dpv3[1,"O_J"] == 1.0)

test_group("merge")

srand(1)
Expand Down

0 comments on commit bb5b59b

Please sign in to comment.