Skip to content

Commit

Permalink
Add pivot_table plus various other fixes related to reshaping (issue #29).
Browse files Browse the repository at this point in the history
  • Loading branch information
powerdistribution committed Jan 24, 2013
1 parent a74a47c commit bb5b59b
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 33 deletions.
3 changes: 2 additions & 1 deletion src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,8 @@ export # reconcile_groups,
RepeatedVector,
EachRepeatedVector,
melt,
melt_df
melt_df,
pivot_table

##############################################################################
##
Expand Down
8 changes: 4 additions & 4 deletions src/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1766,18 +1766,18 @@ end

# Iteration by rows
type DFRowIterator
df::DataFrame
df::AbstractDataFrame
end
EachRow(df::DataFrame) = DFRowIterator(df)
EachRow(df::AbstractDataFrame) = DFRowIterator(df)
start(itr::DFRowIterator) = 1
done(itr::DFRowIterator, i::Int) = i > nrow(itr.df)
next(itr::DFRowIterator, i::Int) = (itr.df[i, :], i + 1)

# Iteration by columns
type DFColumnIterator
df::DataFrame
df::AbstractDataFrame
end
EachCol(df::DataFrame) = DFColumnIterator(df)
EachCol(df::AbstractDataFrame) = DFColumnIterator(df)
start(itr::DFColumnIterator) = 1
done(itr::DFColumnIterator, j::Int) = j > ncol(itr.df)
next(itr::DFColumnIterator, j::Int) = (itr.df[:, j], j + 1)
Expand Down
25 changes: 17 additions & 8 deletions src/extras.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@ const LETTERS = convert(Vector{ASCIIString}, split("ABCDEFGHIJKLMNOPQRSTUVWXYZ",
# Like string(s), but preserves Vector{String} and converts
# Vector{Any} to Vector{String}.
_vstring{T <: String}(s::T) = s
_vstring{T <: String}(s::Vector{T}) = s
_vstring(s::Vector) = map(_vstring, s)
_vstring{T <: String}(s::AbstractVector{T}) = s
_vstring(s::AbstractVector) = String[_vstring(x) for x in s]
_vstring(s::Any) = string(s)

function paste{T<:String}(s::Vector{T}...)
function paste(s...)
s = map(vcat * _vstring, {s...})
sa = {s...}
N = max(length, sa)
res = fill("", N)
Expand All @@ -50,12 +51,20 @@ function paste{T<:String}(s::Vector{T}...)
end
res
end
# The following converts all arguments to Vector{<:String} before
# calling paste.
function paste(s...)
converted = map(vcat * _vstring, {s...})
paste(converted...)

# Concatenate the columns of `d` row-wise into a Vector of strings, with
# `sep` inserted between adjacent column values (but not after the last).
function paste_columns(d::AbstractDataFrame, sep)
nr = nrow(d)
nc = ncol(d)
out = fill("", nr)
for i in 1:nr
for j in 1:nc
out[i] *= string(d[i,j])
if j != nc
out[i] *= sep
end
end
end
out
end
# Default separator is an underscore, e.g. "a_b_c".
paste_columns(d::AbstractDataFrame) = paste_columns(d, "_")

##############################################################################
##
Expand Down
2 changes: 1 addition & 1 deletion src/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Split - Apply - Combine operations
#

function groupsort_indexer(x::Vector, ngroups::Integer)
function groupsort_indexer(x::AbstractVector, ngroups::Int)
## translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).

## count group sizes, location 0 for NA
Expand Down
2 changes: 1 addition & 1 deletion src/pooleddataarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ end

# Need assign()'s to make this work
function show(io::IO, pda::PooledDataArray)
invoke(show, (Any, AbstractArray), io, pda)
invoke(show, (IO, AbstractArray), io, pda)
print(io, "\nlevels: ")
print(io, levels(pda))
end
Expand Down
92 changes: 80 additions & 12 deletions src/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,44 @@
##
## Reshaping
##
## Also, see issue # ??
##
##############################################################################

##############################################################################
##
## stack()
## melt()
##
##############################################################################

function stack(df::DataFrame, measure_vars::Vector{Int}, id_vars::Vector{Int})
remainingcols = _setdiff([1:ncol(df)], measure_vars)
res = rbind([insert!(df[[i, id_vars]], 1, colnames(df)[i], "key") for i in measure_vars]...)
res = rbind([insert!(df[[i, id_vars]], 1, colnames(df)[i], "variable") for i in measure_vars]...)
replace_names!(res, colnames(res)[2], "value")
res
end
stack(df::DataFrame, measure_vars, id_vars) = stack(df, [df.colindex[measure_vars]], [df.colindex[id_vars]])
stack(df::DataFrame, measure_vars) = stack(df, [df.colindex[measure_vars]], _setdiff([1:ncol(df)], [df.colindex[measure_vars]]))

melt(df::DataFrame, id_vars) = stack(df, _setdiff([1:ncol(df)], df.colindex[id_vars]))
melt(df::DataFrame, id_vars) = stack(df, _setdiff([1:ncol(df)], [df.colindex[id_vars]]))
melt(df::DataFrame, id_vars, measure_vars) = stack(df, measure_vars, id_vars)

function unstack(df::DataFrame, ikey::Int, ivalue::Int, irefkey::Int)
keycol = PooledDataArray(df[ikey])
valuecol = df[ivalue]
##############################################################################
##
## unstack()
##
##############################################################################

function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
# `rowkey` integer indicating which column to place along rows
# `colkey` integer indicating which column to place along column headers
# `value` integer indicating which column has values
keycol = PooledDataArray(df[rowkey])
valuecol = df[value]
# TODO make a version with a default refkeycol
refkeycol = PooledDataArray(df[irefkey])
remainingcols = _setdiff([1:ncol(df)], [ikey, ivalue])
refkeycol = PooledDataArray(df[colkey])
remainingcols = _setdiff([1:ncol(df)], [rowkey, value])
Nrow = length(refkeycol.pool)
Ncol = length(keycol.pool)
# TODO make fillNA(type, length)
Expand All @@ -38,10 +56,50 @@ function unstack(df::DataFrame, ikey::Int, ivalue::Int, irefkey::Int)
payload[j][i] = valuecol[k]
end
end
insert!(payload, 1, refkeycol.pool, colnames(df)[irefkey])
insert!(payload, 1, refkeycol.pool, colnames(df)[colkey])
end
unstack(df::AbstractDataFrame, rowkey, value, colkey) =
unstack(df, index(df)[rowkey], index(df)[value], index(df)[colkey])

##############################################################################
##
## pivot_table()
##
##############################################################################

# Limitations:
# - only one `value` column is allowed (same as dcast)
# - `fun` must reduce to one value
# - no margins
# - can't have zero rows or zero columns
# - the resulting data part is Float64 (`payload` below)

# Reshape `df` into a spreadsheet-style pivot table: unique combinations of
# the `rows` key columns become rows, unique combinations of the `cols` key
# columns become column headers, and each cell holds `fun` applied to the
# matching `value` entries.
function pivot_table(df::AbstractDataFrame, rows::Vector{Int}, cols::Vector{Int}, value::Int, fun::Function)
# `rows` vector indicating which columns are keys placed in rows
# `cols` vector indicating which columns are keys placed as column headers
# `value` integer indicating which column has values
# `fun` function applied to the value column during aggregation
# Aggregate once per (row-key, col-key) combination; the aggregate lands
# in the auto-named column "x1" of `cmb_df`.
cmb_df = by(df, [rows, cols], d->fun(d[value]))
# Collapse the multi-column keys into single string keys via paste_columns;
# the pooled refs then serve as integer row/column positions.
row_pdv = PooledDataArray(paste_columns(cmb_df[[length(rows):-1:1]])) # the :-1: is to reverse the columns for sorting
row_idxs = int(row_pdv.refs)
Nrow = length(row_pdv.pool)
col_pdv = PooledDataArray(paste_columns(cmb_df[[length(rows) + (1:length(cols))]]))
col_idxs = int(col_pdv.refs)
Ncol = length(col_pdv.pool)
# `payload` is the main "data holding" part of the resulting DataFrame
payload = DataFrame(Nrow, Ncol)
colnames!(payload, col_pdv.pool)
# Scatter each aggregated value into its (row, column) cell; combinations
# absent from `cmb_df` are left as the DataFrame default (NA).
for i in 1:length(row_idxs)
payload[row_idxs[i], col_idxs[i]] = cmb_df[i,"x1"]
end
# find the "row" key DataFrame
g = groupby(cmb_df[[1:length(rows)]], [1:length(rows)])
row_key_df = g.parent[g.idx[g.starts],:]
# Final result: the row-key columns on the left, data cells on the right.
cbind(row_key_df, payload)
end
unstack(df::DataFrame, ikey, ivalue, irefkey) =
unstack(df, df.colindex[ikey], df.colindex[ivalue], df.colindex[irefkey])
# `mean` is the default aggregation function:
pivot_table(df::AbstractDataFrame, rows, cols, value) = pivot_table(df, rows, cols, value, mean)
# Resolve column names (or other indexables) to integer indices via the
# frame's index, then dispatch to the Vector{Int} core method above.
pivot_table(df::AbstractDataFrame, rows, cols, value, fun) = pivot_table(df, [index(df)[rows]], [index(df)[cols]], index(df)[value], fun)


##############################################################################
Expand Down Expand Up @@ -102,6 +160,8 @@ eltype(v::StackedVector) = eltype(v.components[1])
show(io::IO, v::StackedVector) = internal_show_vector(io, v)
repl_show(io::IO, v::StackedVector) = internal_repl_show_vector(io, v)

# Materialize the lazy StackedVector (via full indexing) and pool it.
PooledDataArray(v::StackedVector) = PooledDataArray(v[:])

function ref{T,I<:Real}(v::RepeatedVector{T},i::AbstractVector{I})
j = mod(i - 1, length(v.parent)) + 1
v.parent[j]
Expand All @@ -120,6 +180,9 @@ eltype{T}(v::RepeatedVector{T}) = T
show(io::IO, v::RepeatedVector) = internal_show_vector(io, v)
repl_show(io::IO, v::RepeatedVector) = internal_repl_show_vector(io, v)

# The distinct values of a RepeatedVector are exactly those of its parent.
unique(v::RepeatedVector) = unique(v.parent)

# Pool the parent once, then repeat it v.n times by splat-concatenating the
# copies. NOTE(review): relies on `[xs...]` vcat-ing arrays in this Julia
# version and appears to yield a plain concatenation rather than a single
# PooledDataArray — confirm intended return type against callers.
PooledDataArray(v::RepeatedVector) = [fill(PooledDataArray(v.parent), v.n)...]

function ref{T}(v::EachRepeatedVector{T},i::Real)
j = div(i - 1, v.n) + 1
Expand All @@ -139,6 +202,10 @@ eltype{T}(v::EachRepeatedVector{T}) = T
show(io::IO, v::EachRepeatedVector) = internal_show_vector(io, v)
repl_show(io::IO, v::EachRepeatedVector) = internal_repl_show_vector(io, v)

# The distinct values of an EachRepeatedVector are exactly those of its parent.
unique(v::EachRepeatedVector) = unique(v.parent)

# Materialize the lazy vector and pool it, using the parent's unique values
# as the pool. Fix: the original referenced an undefined variable `a`
# (`PooledDataArray(a[:], ...)`); the materialized values come from `v[:]`,
# mirroring the StackedVector method above.
PooledDataArray(v::EachRepeatedVector) = PooledDataArray(v[:], unique(v.parent))


# The default values of show and repl_show don't work because
# both try to reshape the vector into a matrix, and these
Expand All @@ -165,6 +232,7 @@ end
##############################################################################
##
## stack_df()
## melt_df()
## Reshaping using referencing (issue #145), using the above vector types
##
##############################################################################
Expand All @@ -176,8 +244,8 @@ function stack_df(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vec
remainingcols = _setdiff([1:ncol(df)], measure_vars)
names = colnames(df)[id_vars]
insert!(names, 1, "value")
insert!(names, 1, "key")
DataFrame({EachRepeatedVector(colnames(df)[measure_vars], nrow(df)), # key
insert!(names, 1, "variable")
DataFrame({EachRepeatedVector(colnames(df)[measure_vars], nrow(df)), # variable
StackedVector({df[:,c] for c in 1:N}), # value
## RepeatedVector([1:nrow(df)], N), # idx - do we want this?
[RepeatedVector(df[:,c], N) for c in id_vars]...}, # id_var columns
Expand Down
27 changes: 21 additions & 6 deletions test/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -390,22 +390,37 @@ d1s = stack(d1, ["a", "b"])
d1s2 = stack(d1, ["c", "d"])
@assert isequal(d1s[1:12, "c"], d1["c"])
@assert isequal(d1s[13:24, "c"], d1["c"])
@assert all(colnames(d1s) .== ["key", "value", "c", "d"])
@assert all(colnames(d1s) .== ["variable", "value", "c", "d"])
d1s_df = stack_df(d1, ["a", "b"])
@assert isequal(d1s["key"], d1s_df["key"][:])
@assert isequal(d1s["variable"], d1s_df["variable"][:])
@assert isequal(d1s["value"], d1s_df["value"][:])
@assert isequal(d1s["c"], d1s_df["c"][:])
@assert isequal(d1s[1,:], d1s_df[1,:])



d1s["idx"] = [1:12, 1:12]
d1s2["idx"] = [1:12, 1:12]
d1us = unstack(d1s, "key", "value", "idx")
d1us2 = unstack(d1s2, "key", "value", "idx")
d1us = unstack(d1s, "variable", "idx", "value")
d1us2 = unstack(d1s2, "variable", "idx", "value")
@assert isequal(d1us["a"], d1["a"])
@assert isequal(d1us2["d"], d1["d"])

# Fixture for pivot_table: columns a/b/c are string keys, d holds numeric
# values (pi * 1:14). `letters`/`LETTERS` are the lowercase/uppercase
# alphabet vectors exported by this package.
d = DataFrame(quote
a = letters[5:8]
b = LETTERS[10:11]
c = LETTERS[13 + [1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2]]
d = pi * [1:14]
end)

# Pivot with two row keys and one column key; default aggregation is mean.
# Cell (1, "O") holds a single observation, so it equals that raw value.
dpv = pivot_table(d, ["a", "b"], "c", "d")
@assert( dpv[1,"O"] == d[5,"d"])
@assert( nrow(dpv) == 4 )

# Two column keys: headers are pasted with "_" (see paste_columns), e.g. "O_J".
dpv2 = pivot_table(d, ["a"], ["c", "b"], "d")
@assert( dpv2[1,"O_J"] == d[5,"d"])

# Explicit aggregation function: `length` counts observations per cell.
dpv3 = pivot_table(d, ["a"], ["c", "b"], "d", length)
@assert( dpv3[1,"O_J"] == 1.0)

test_group("merge")

srand(1)
Expand Down

0 comments on commit bb5b59b

Please sign in to comment.