diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 340210c3e5..56ce05ead8 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -191,7 +191,8 @@ export # reconcile_groups, RepeatedVector, EachRepeatedVector, melt, - melt_df + melt_df, + pivot_table ############################################################################## ## diff --git a/src/dataframe.jl b/src/dataframe.jl index 4602083ad3..9eecf3a305 100644 --- a/src/dataframe.jl +++ b/src/dataframe.jl @@ -1766,18 +1766,18 @@ end # Iteration by rows type DFRowIterator - df::DataFrame + df::AbstractDataFrame end -EachRow(df::DataFrame) = DFRowIterator(df) +EachRow(df::AbstractDataFrame) = DFRowIterator(df) start(itr::DFRowIterator) = 1 done(itr::DFRowIterator, i::Int) = i > nrow(itr.df) next(itr::DFRowIterator, i::Int) = (itr.df[i, :], i + 1) # Iteration by columns type DFColumnIterator - df::DataFrame + df::AbstractDataFrame end -EachCol(df::DataFrame) = DFColumnIterator(df) +EachCol(df::AbstractDataFrame) = DFColumnIterator(df) start(itr::DFColumnIterator) = 1 done(itr::DFColumnIterator, j::Int) = j > ncol(itr.df) next(itr::DFColumnIterator, j::Int) = (itr.df[:, j], j + 1) diff --git a/src/extras.jl b/src/extras.jl index f8c8038b1b..727c6acb03 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -28,11 +28,12 @@ const LETTERS = convert(Vector{ASCIIString}, split("ABCDEFGHIJKLMNOPQRSTUVWXYZ", # Like string(s), but preserves Vector{String} and converts # Vector{Any} to Vector{String}. _vstring{T <: String}(s::T) = s -_vstring{T <: String}(s::Vector{T}) = s -_vstring(s::Vector) = map(_vstring, s) +_vstring{T <: String}(s::AbstractVector{T}) = s +_vstring(s::AbstractVector) = String[_vstring(x) for x in s] _vstring(s::Any) = string(s) -function paste{T<:String}(s::Vector{T}...) +function paste(s...) + s = map(vcat * _vstring, {s...}) sa = {s...} N = max(length, sa) res = fill("", N) @@ -50,12 +51,20 @@ function paste{T<:String}(s::Vector{T}...) 
end res end -# The following converts all arguments to Vector{<:String} before -# calling paste. -function paste(s...) - converted = map(vcat * _vstring, {s...}) - paste(converted...) + +function paste_columns(d::AbstractDataFrame, sep) + res = fill("", nrow(d)) + for j in 1:ncol(d) + for i in 1:nrow(d) + res[i] *= string(d[i,j]) + if j != ncol(d) + res[i] *= sep + end + end + end + res end +paste_columns(d::AbstractDataFrame) = paste_columns(d, "_") ############################################################################## ## diff --git a/src/grouping.jl b/src/grouping.jl index 2b7bdfe924..f3f5f887af 100644 --- a/src/grouping.jl +++ b/src/grouping.jl @@ -2,7 +2,7 @@ # Split - Apply - Combine operations # -function groupsort_indexer(x::Vector, ngroups::Integer) +function groupsort_indexer(x::AbstractVector, ngroups::Int) ## translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). ## count group sizes, location 0 for NA diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index f68acdb3f5..f95146a2e3 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -528,7 +528,7 @@ end # Need assign()'s to make this work function show(io::IO, pda::PooledDataArray) - invoke(show, (Any, AbstractArray), io, pda) + invoke(show, (IO, AbstractArray), io, pda) print(io, "\nlevels: ") print(io, levels(pda)) end diff --git a/src/reshape.jl b/src/reshape.jl index 13ac05234b..b61f2160e3 100644 --- a/src/reshape.jl +++ b/src/reshape.jl @@ -2,26 +2,44 @@ ## ## Reshaping ## +## Also, see issue # ?? 
+## +############################################################################## + +############################################################################## +## +## stack() +## melt() +## ############################################################################## function stack(df::DataFrame, measure_vars::Vector{Int}, id_vars::Vector{Int}) remainingcols = _setdiff([1:ncol(df)], measure_vars) - res = rbind([insert!(df[[i, id_vars]], 1, colnames(df)[i], "key") for i in measure_vars]...) + res = rbind([insert!(df[[i, id_vars]], 1, colnames(df)[i], "variable") for i in measure_vars]...) replace_names!(res, colnames(res)[2], "value") res end stack(df::DataFrame, measure_vars, id_vars) = stack(df, [df.colindex[measure_vars]], [df.colindex[id_vars]]) stack(df::DataFrame, measure_vars) = stack(df, [df.colindex[measure_vars]], _setdiff([1:ncol(df)], [df.colindex[measure_vars]])) -melt(df::DataFrame, id_vars) = stack(df, _setdiff([1:ncol(df)], df.colindex[id_vars])) +melt(df::DataFrame, id_vars) = stack(df, _setdiff([1:ncol(df)], [df.colindex[id_vars]])) melt(df::DataFrame, id_vars, measure_vars) = stack(df, measure_vars, id_vars) -function unstack(df::DataFrame, ikey::Int, ivalue::Int, irefkey::Int) - keycol = PooledDataArray(df[ikey]) - valuecol = df[ivalue] +############################################################################## +## +## unstack() +## +############################################################################## + +function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int) + # `rowkey` integer indicating which column to place along rows + # `colkey` integer indicating which column to place along column headers + # `value` integer indicating which column has values + keycol = PooledDataArray(df[rowkey]) + valuecol = df[value] # TODO make a version with a default refkeycol - refkeycol = PooledDataArray(df[irefkey]) - remainingcols = _setdiff([1:ncol(df)], [ikey, ivalue]) + refkeycol = PooledDataArray(df[colkey]) + 
 remainingcols = _setdiff([1:ncol(df)], [rowkey, value]) Nrow = length(refkeycol.pool) Ncol = length(keycol.pool) # TODO make fillNA(type, length) @@ -38,10 +56,50 @@ function unstack(df::DataFrame, ikey::Int, ivalue::Int, irefkey::Int) payload[j][i] = valuecol[k] end end - insert!(payload, 1, refkeycol.pool, colnames(df)[irefkey]) + insert!(payload, 1, refkeycol.pool, colnames(df)[colkey]) +end +unstack(df::AbstractDataFrame, rowkey, colkey, value) = + unstack(df, index(df)[rowkey], index(df)[colkey], index(df)[value]) + +############################################################################## +## +## pivot_table() +## +############################################################################## + +# Limitations: +# - only one `value` column is allowed (same as dcast) +# - `fun` must reduce to one value +# - no margins +# - can't have zero rows or zero columns +# - the resulting data part is Float64 (`payload` below) + +function pivot_table(df::AbstractDataFrame, rows::Vector{Int}, cols::Vector{Int}, value::Int, fun::Function) + # `rows` vector indicating which columns are keys placed in rows + # `cols` vector indicating which columns are keys placed as column headers + # `value` integer indicating which column has values + # `fun` function applied to the value column during aggregation + cmb_df = by(df, [rows, cols], d->fun(d[value])) + row_pdv = PooledDataArray(paste_columns(cmb_df[[length(rows):-1:1]])) # the :-1: is to reverse the columns for sorting + row_idxs = int(row_pdv.refs) + Nrow = length(row_pdv.pool) + col_pdv = PooledDataArray(paste_columns(cmb_df[[length(rows) + (1:length(cols))]])) + col_idxs = int(col_pdv.refs) + Ncol = length(col_pdv.pool) + # `payload` is the main "data holding" part of the resulting DataFrame + payload = DataFrame(Nrow, Ncol) + colnames!(payload, col_pdv.pool) + for i in 1:length(row_idxs) + payload[row_idxs[i], col_idxs[i]] = cmb_df[i,"x1"] + end + # find the "row" key DataFrame + g = groupby(cmb_df[[1:length(rows)]], 
[1:length(rows)]) + row_key_df = g.parent[g.idx[g.starts],:] + cbind(row_key_df, payload) end -unstack(df::DataFrame, ikey, ivalue, irefkey) = - unstack(df, df.colindex[ikey], df.colindex[ivalue], df.colindex[irefkey]) +# `mean` is the default aggregation function: +pivot_table(df::AbstractDataFrame, rows, cols, value) = pivot_table(df, rows, cols, value, mean) +pivot_table(df::AbstractDataFrame, rows, cols, value, fun) = pivot_table(df, [index(df)[rows]], [index(df)[cols]], index(df)[value], fun) ############################################################################## @@ -102,6 +160,8 @@ eltype(v::StackedVector) = eltype(v.components[1]) show(io::IO, v::StackedVector) = internal_show_vector(io, v) repl_show(io::IO, v::StackedVector) = internal_repl_show_vector(io, v) +PooledDataArray(v::StackedVector) = PooledDataArray(v[:]) + function ref{T,I<:Real}(v::RepeatedVector{T},i::AbstractVector{I}) j = mod(i - 1, length(v.parent)) + 1 v.parent[j] @@ -120,6 +180,9 @@ eltype{T}(v::RepeatedVector{T}) = T show(io::IO, v::RepeatedVector) = internal_show_vector(io, v) repl_show(io::IO, v::RepeatedVector) = internal_repl_show_vector(io, v) +unique(v::RepeatedVector) = unique(v.parent) + +PooledDataArray(v::RepeatedVector) = [fill(PooledDataArray(v.parent), v.n)...] 
function ref{T}(v::EachRepeatedVector{T},i::Real) j = div(i - 1, v.n) + 1 @@ -139,6 +202,10 @@ eltype{T}(v::EachRepeatedVector{T}) = T show(io::IO, v::EachRepeatedVector) = internal_show_vector(io, v) repl_show(io::IO, v::EachRepeatedVector) = internal_repl_show_vector(io, v) +unique(v::EachRepeatedVector) = unique(v.parent) + +PooledDataArray(v::EachRepeatedVector) = PooledDataArray(v[:], unique(v.parent)) + # The default values of show and repl_show don't work because # both try to reshape the vector into a matrix, and these @@ -165,6 +232,7 @@ end ############################################################################## ## ## stack_df() +## melt_df() ## Reshaping using referencing (issue #145), using the above vector types ## ############################################################################## @@ -176,8 +244,8 @@ function stack_df(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vec remainingcols = _setdiff([1:ncol(df)], measure_vars) names = colnames(df)[id_vars] insert!(names, 1, "value") - insert!(names, 1, "key") - DataFrame({EachRepeatedVector(colnames(df)[measure_vars], nrow(df)), # key + insert!(names, 1, "variable") + DataFrame({EachRepeatedVector(colnames(df)[measure_vars], nrow(df)), # variable StackedVector({df[:,c] for c in 1:N}), # value ## RepeatedVector([1:nrow(df)], N), # idx - do we want this? 
[RepeatedVector(df[:,c], N) for c in id_vars]...}, # id_var columns diff --git a/test/data.jl b/test/data.jl index 966a417201..59098a0912 100644 --- a/test/data.jl +++ b/test/data.jl @@ -390,22 +390,37 @@ d1s = stack(d1, ["a", "b"]) d1s2 = stack(d1, ["c", "d"]) @assert isequal(d1s[1:12, "c"], d1["c"]) @assert isequal(d1s[13:24, "c"], d1["c"]) -@assert all(colnames(d1s) .== ["key", "value", "c", "d"]) +@assert all(colnames(d1s) .== ["variable", "value", "c", "d"]) d1s_df = stack_df(d1, ["a", "b"]) -@assert isequal(d1s["key"], d1s_df["key"][:]) +@assert isequal(d1s["variable"], d1s_df["variable"][:]) @assert isequal(d1s["value"], d1s_df["value"][:]) @assert isequal(d1s["c"], d1s_df["c"][:]) @assert isequal(d1s[1,:], d1s_df[1,:]) - - d1s["idx"] = [1:12, 1:12] d1s2["idx"] = [1:12, 1:12] -d1us = unstack(d1s, "key", "value", "idx") -d1us2 = unstack(d1s2, "key", "value", "idx") +d1us = unstack(d1s, "variable", "idx", "value") +d1us2 = unstack(d1s2, "variable", "idx", "value") @assert isequal(d1us["a"], d1["a"]) @assert isequal(d1us2["d"], d1["d"]) +d = DataFrame(quote + a = letters[5:8] + b = LETTERS[10:11] + c = LETTERS[13 + [1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2]] + d = pi * [1:14] +end) + +dpv = pivot_table(d, ["a", "b"], "c", "d") +@assert( dpv[1,"O"] == d[5,"d"]) +@assert( nrow(dpv) == 4 ) + +dpv2 = pivot_table(d, ["a"], ["c", "b"], "d") +@assert( dpv2[1,"O_J"] == d[5,"d"]) + +dpv3 = pivot_table(d, ["a"], ["c", "b"], "d", length) +@assert( dpv3[1,"O_J"] == 1.0) + test_group("merge") srand(1)