From 9b4a7010e6a3586dddee073bb14a6daaed0934d3 Mon Sep 17 00:00:00 2001 From: garborg Date: Sun, 9 Feb 2014 19:30:47 -0600 Subject: [PATCH] Function renaming and deprecation from #517 --- benchmarks/datamatrix.jl | 2 +- src/DataFrames.jl | 25 +++------- src/dataframe/dataframe.jl | 97 ++++---------------------------------- src/dataframe/io.jl | 2 +- src/dataframe/reshape.jl | 27 +++++------ src/deprecated.jl | 72 +++++++++++++++++++++++++++- test/data.jl | 2 +- test/dataframe.jl | 2 +- test/duplicates.jl | 8 ++-- test/runtests.jl | 3 +- test/{rbind.jl => vcat.jl} | 24 +++++----- 11 files changed, 119 insertions(+), 145 deletions(-) rename test/{rbind.jl => vcat.jl} (67%) diff --git a/benchmarks/datamatrix.jl b/benchmarks/datamatrix.jl index 0d77d5b0c2..2c2b8edc6c 100644 --- a/benchmarks/datamatrix.jl +++ b/benchmarks/datamatrix.jl @@ -27,7 +27,7 @@ df3 = benchmark(f3, 1_000) # TODO: Keep permanent record -printtable(rbind(df1, df2, df3), header=false) +printtable(vcat(df1, df2, df3), header=false) # Compare with R # We're 10x as fast! diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 805d26f638..2504ed8333 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -56,15 +56,11 @@ export @~, combine, complete_cases, complete_cases!, - cut, DataFrame, DataFrameRow, describe, - drop_duplicates!, - duplicated, eachcol, eachrow, - findat, flipud!, flipud, Formula, @@ -73,7 +69,7 @@ export @~, groupby, GroupedDataFrame, interaction_design_matrix, - load_df, + loaddf, matrix, merge, model_response, @@ -81,12 +77,12 @@ export @~, ModelMatrix, names!, ncol, + nonunique, nrow, order, pool, pool!, printtable, - range, rbind, readtable, rename!, @@ -97,25 +93,16 @@ export @~, subset, types, unique, + unique!, unstack, vector, writetable, xtab, xtabs, - stack_df, + stackdf, melt, - melt_df, - pivot_table, - RComplex, # Vector{Complex128} - RInteger, # Vector{Int32} plus BitVector of NA indicators - RLogical, # BitVector of values and BitVector of NA indicators - RNumeric, # Vector{Float64} - RList, # Vector{Any} - RString, # Vector{ASCIIString} - RSymbol, # Symbol stored as an String b/c of embedded '.' - - class, # in the S3 sense of "class" - inherits, + meltdf, + pivottable, read_rda, vecbind diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 771f7c7333..a1f814b9d1 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -902,11 +902,9 @@ end without(df::DataFrame, i::Int) = without(df, [i]) without(df::DataFrame, c::Any) = without(df, df.colindex[c]) -#### cbind, rbind, hcat, vcat -# hcat() is just cbind() -# rbind(df, ...) only accepts data frames. Finds union of columns, maintaining order +#### hcat, vcat +# vcat(df, ...) only accepts data frames. Finds union of columns, maintaining order # of first df. Missing data becomes NAs. -# vcat() is just rbind() # two-argument form, two dfs, references only function Base.hcat(df1::DataFrame, df2::DataFrame) @@ -922,7 +920,6 @@ Base.hcat{T}(df::DataFrame, x::T) = hcat(df, DataFrame({DataArray([x])})) # three-plus-argument form recurses Base.hcat(a::DataFrame, b, c...) = hcat(hcat(a, b), c...) -cbind(args...) = hcat(args...) Base.similar(df::DataFrame, dims) = DataFrame([similar(x, dims) for x in df.columns], names(df)) @@ -938,60 +935,6 @@ nas{T,R}(dv::PooledDataVector{T,R}, dims) = nas(df::DataFrame, dims) = DataFrame([nas(x, dims) for x in df.columns], names(df)) -vecbind_type{T}(::Vector{T}) = Vector{T} -vecbind_type{T<:AbstractVector}(x::T) = Vector{eltype(x)} -vecbind_type{T<:AbstractDataVector}(x::T) = DataVector{eltype(x)} -vecbind_type{T}(::PooledDataVector{T}) = DataVector{T} - -vecbind_promote_type{T1,T2}(x::Type{Vector{T1}}, y::Type{Vector{T2}}) = Array{promote_type(eltype(x), eltype(y)),1} -vecbind_promote_type{T1,T2}(x::Type{DataVector{T1}}, y::Type{DataVector{T2}}) = DataArray{promote_type(eltype(x), eltype(y)),1} -vecbind_promote_type{T1,T2}(x::Type{Vector{T1}}, y::Type{DataVector{T2}}) = DataArray{promote_type(eltype(x), eltype(y)),1} -vecbind_promote_type{T1,T2}(x::Type{DataVector{T1}}, y::Type{Vector{T2}}) = DataArray{promote_type(eltype(x), eltype(y)),1} -vecbind_promote_type(a, b, c, ds...) = vecbind_promote_type(a, vecbind_promote_type(b, c, ds...)) -vecbind_promote_type(a, b, c) = vecbind_promote_type(a, vecbind_promote_type(b, c)) - -function vecbind_promote_type(a::AbstractVector) - res = None - if isdefined(a, 1) - if length(a) == 1 - return a[1] - else - if isdefined(a, 2) - res = vecbind_promote_type(a[1], a[2]) - else - res = a[1] - end - end - end - for i in 3:length(a) - if isdefined(a, i) - res = vecbind_promote_type(res, a[i]) - end - end - return res -end - -constructor{T}(::Type{Vector{T}}, args...) = Array(T, args...) -constructor{T}(::Type{DataVector{T}}, args...) = DataArray(T, args...) - -function vecbind(xs::AbstractVector...) - V = vecbind_promote_type(map(vecbind_type, {xs...})) - len = sum(length, xs) - res = constructor(V, len) - k = 1 - for i in 1:length(xs) - for j in 1:length(xs[i]) - res[k] = xs[i][j] - k += 1 - end - end - res -end - -function vecbind(xs::PooledDataVector...) - vecbind(map(x -> convert(DataArray, x), xs)...) -end - Base.vcat(df::AbstractDataFrame) = df Base.vcat{T<:AbstractDataFrame}(dfs::Vector{T}) = vcat(dfs...) @@ -1027,8 +970,6 @@ function Base.vcat(dfs::AbstractDataFrame...) res end -rbind(args...) = vcat(args...) - ## ## Miscellaneous ## @@ -1070,7 +1011,7 @@ function DataArrays.DataArray(adf::AbstractDataFrame, return dm end -function duplicated(df::AbstractDataFrame) +function nonunique(df::AbstractDataFrame) # Return a Vector{Bool} indicated whether the row is a duplicate # of a prior row. res = fill(false, nrow(df)) @@ -1085,15 +1026,15 @@ function duplicated(df::AbstractDataFrame) res end -function drop_duplicates!(df::AbstractDataFrame) - deleterows!(df, find(!duplicated(df))) +function unique!(df::AbstractDataFrame) + deleterows!(df, find(!nonunique(df))) end # Unique rows of an AbstractDataFrame. -Base.unique(df::AbstractDataFrame) = df[!duplicated(df), :] +Base.unique(df::AbstractDataFrame) = df[!nonunique(df), :] -function duplicatedkey(df::AbstractDataFrame) - # Here's another (probably a lot faster) way to do `duplicated` +function nonuniquekey(df::AbstractDataFrame) + # Here's another (probably a lot faster) way to do `nonunique` # by grouping on all columns. It will fail if columns cannot be # made into PooledDataVector's. gd = groupby(df, names(df)) @@ -1110,28 +1051,6 @@ function cleannames!(df::DataFrame) return end -function Base.flipud(df::DataFrame) - return df[reverse(1:nrow(df)), :] -end - -function flipud!(df::DataFrame) - df[1:nrow(df), :] = df[reverse(1:nrow(df)), :] - return -end - -# reorder! for factors by specifying a DataFrame -function DataArrays.reorder(fun::Function, x::PooledDataArray, df::AbstractDataFrame) - dfc = copy(df) - dfc["__key__"] = x - gd = by(dfc, "__key__", df -> colwise(fun, without(df, "__key__"))) - idx = sortperm(gd[[2:ncol(gd)]]) - return PooledDataArray(x, dropna(gd[idx,1])) -end -DataArrays.reorder(x::PooledDataArray, df::AbstractDataFrame) = reorder(:mean, x, df) - -DataArrays.reorder(fun::Function, x::PooledDataArray, y::AbstractVector...) = - reorder(fun, x, DataFrame({y...})) - ############################################################################## ## ## Hashing diff --git a/src/dataframe/io.jl b/src/dataframe/io.jl index c1651ec95f..083e1f7bed 100644 --- a/src/dataframe/io.jl +++ b/src/dataframe/io.jl @@ -1079,7 +1079,7 @@ function save(filename::String, df::AbstractDataFrame) return end -function load_df(filename::String) +function loaddf(filename::String) f = open(filename) dd = deserialize(f) close(f) diff --git a/src/dataframe/reshape.jl b/src/dataframe/reshape.jl index 67192b8943..fe65a5aea3 100644 --- a/src/dataframe/reshape.jl +++ b/src/dataframe/reshape.jl @@ -64,7 +64,7 @@ unstack(df::AbstractDataFrame, rowkey, value, colkey) = ############################################################################## ## -## pivot_table() +## pivottable() ## ############################################################################## @@ -75,7 +75,7 @@ unstack(df::AbstractDataFrame, rowkey, value, colkey) = # - can't have zero rows or zero columns # - the resulting data part is Float64 (`payload` below) -function pivot_table(df::AbstractDataFrame, rows::Vector{Int}, cols::Vector{Int}, value::Int, fun::Function) +function pivottable(df::AbstractDataFrame, rows::Vector{Int}, cols::Vector{Int}, value::Int, fun::Function) # `rows` vector indicating which columns are keys placed in rows # `cols` vector indicating which columns are keys placed as column headers # `value` integer indicating which column has values @@ -99,8 +99,8 @@ function pivot_table(df::AbstractDataFrame, rows::Vector{Int}, cols::Vector{Int} hcat(row_key_df, payload) end # `mean` is the default aggregation function: -pivot_table(df::AbstractDataFrame, rows, cols, value) = pivot_table(df, rows, cols, value, mean) -pivot_table(df::AbstractDataFrame, rows, cols, value, fun) = pivot_table(df, [index(df)[rows]], [index(df)[cols]], index(df)[value], fun) +pivottable(df::AbstractDataFrame, rows, cols, value) = pivottable(df, rows, cols, value, mean) +pivottable(df::AbstractDataFrame, rows, cols, value, fun) = pivottable(df, [index(df)[rows]], [index(df)[cols]], index(df)[value], fun) ############################################################################## @@ -157,7 +157,6 @@ Base.size(v::StackedVector) = (length(v),) Base.length(v::StackedVector) = sum(map(length, v.components)) Base.ndims(v::StackedVector) = 1 Base.eltype(v::StackedVector) = promote_type(map(eltype, v.components)...) -vecbind_type(v::StackedVector) = vecbind_promote_type(map(vecbind_type, v.components)...) Base.similar(v::StackedVector, T, dims::Dims) = similar(v.components[1], T, dims) Base.show(io::IO, v::StackedVector) = internal_show_vector(io, v) @@ -179,7 +178,6 @@ Base.size(v::RepeatedVector) = (length(v),) Base.length(v::RepeatedVector) = v.n * length(v.parent) Base.ndims(v::RepeatedVector) = 1 Base.eltype{T}(v::RepeatedVector{T}) = T -vecbind_type(v::RepeatedVector) = vecbind_type(v.parent) Base.reverse(v::RepeatedVector) = RepeatedVector(reverse(v.parent), v.n) Base.similar(v::RepeatedVector, T, dims::Dims) = similar(v.parent, T, dims) @@ -208,7 +206,6 @@ Base.size(v::EachRepeatedVector) = (length(v),) Base.length(v::EachRepeatedVector) = v.n * length(v.parent) Base.ndims(v::EachRepeatedVector) = 1 Base.eltype{T}(v::EachRepeatedVector{T}) = T -vecbind_type(v::EachRepeatedVector) = vecbind_type(v.parent) Base.reverse(v::EachRepeatedVector) = EachRepeatedVector(reverse(v.parent), v.n) Base.similar(v::EachRepeatedVector, T, dims::Dims) = similar(v.parent, T, dims) @@ -250,15 +247,15 @@ end ############################################################################## ## -## stack_df() -## melt_df() +## stackdf() +## meltdf() ## Reshaping using referencing (issue #145), using the above vector types ## ############################################################################## # Same as `stack`, but uses references # I'm not sure the name is very good -function stack_df(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vector{Int}) +function stackdf(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vector{Int}) N = length(measure_vars) remainingcols = _setdiff([1:ncol(df)], measure_vars) cnames = names(df)[id_vars] @@ -270,10 +267,10 @@ function stack_df(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vec [RepeatedVector(df[:,c], N) for c in id_vars]...}, # id_var columns cnames) end -stack_df(df::AbstractDataFrame, measure_vars) = stack_df(df, [index(df)[measure_vars]]) +stackdf(df::AbstractDataFrame, measure_vars) = stackdf(df, [index(df)[measure_vars]]) -stack_df(df::AbstractDataFrame, measure_vars, id_vars) = stack_df(df, [index(df)[measure_vars]], [index(df)[id_vars]]) -stack_df(df::AbstractDataFrame, measure_vars) = stack_df(df, [index(df)[measure_vars]], _setdiff([1:ncol(df)], [index(df)[measure_vars]])) +stackdf(df::AbstractDataFrame, measure_vars, id_vars) = stackdf(df, [index(df)[measure_vars]], [index(df)[id_vars]]) +stackdf(df::AbstractDataFrame, measure_vars) = stackdf(df, [index(df)[measure_vars]], _setdiff([1:ncol(df)], [index(df)[measure_vars]])) -melt_df(df::AbstractDataFrame, id_vars) = stack_df(df, _setdiff([1:ncol(df)], index(df)[id_vars])) -melt_df(df::AbstractDataFrame, id_vars, measure_vars) = stack_df(df, measure_vars, id_vars) +meltdf(df::AbstractDataFrame, id_vars) = stackdf(df, _setdiff([1:ncol(df)], index(df)[id_vars])) +meltdf(df::AbstractDataFrame, id_vars, measure_vars) = stackdf(df, measure_vars, id_vars) diff --git a/src/deprecated.jl b/src/deprecated.jl index 74225ce0ae..6fc1e6c183 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -13,6 +13,12 @@ Base.@deprecate coltypes(adf::AbstractDataFrame) types Base.@deprecate EachRow eachrow Base.@deprecate EachCol eachcol Base.@deprecate subset sub +Base.@deprecate drop_duplicates! unique! +Base.@deprecate duplicated nonunique +Base.@deprecate load_df loaddf +Base.@deprecate melt_df meltdf +Base.@deprecate stack_df stackdf +Base.@deprecate pivot_table pivottable function DataFrame(df::DataFrame) depwarn("DataFrame(::DataFrame) is deprecated, use convert(DataFrame, DataFrame) instead", @@ -167,12 +173,12 @@ function DataArrays.reorder(fun::Function, x::PooledDataArray, y::AbstractVector end function Base.flipud(df::DataFrame) - depwarn("flipud(DataFrame)", :flipud) + depwarn("flipud(DataFrame) is deprecated", :flipud) return df[reverse(1:nrow(df)), :] end function flipud!(df::DataFrame) - depwarn("flipud!(DataFrame)", :flipud!) + depwarn("flipud!(DataFrame) is deprecated", :flipud!) df[1:nrow(df), :] = df[reverse(1:nrow(df)), :] return end @@ -185,3 +191,65 @@ function cleannames!(df::DataFrame) return end +# rbind, cbind, vecbind + +rbind(args...) = vcat(args...) + +cbind(args...) = hcat(args...) + +vecbind_type{T}(::Vector{T}) = Vector{T} +vecbind_type{T<:AbstractVector}(x::T) = Vector{eltype(x)} +vecbind_type{T<:AbstractDataVector}(x::T) = DataVector{eltype(x)} +vecbind_type{T}(::PooledDataVector{T}) = DataVector{T} +vecbind_type(v::StackedVector) = vecbind_promote_type(map(vecbind_type, v.components)...) +vecbind_type(v::RepeatedVector) = vecbind_type(v.parent) +vecbind_type(v::EachRepeatedVector) = vecbind_type(v.parent) + +vecbind_promote_type{T1,T2}(x::Type{Vector{T1}}, y::Type{Vector{T2}}) = Array{promote_type(eltype(x), eltype(y)),1} +vecbind_promote_type{T1,T2}(x::Type{DataVector{T1}}, y::Type{DataVector{T2}}) = DataArray{promote_type(eltype(x), eltype(y)),1} +vecbind_promote_type{T1,T2}(x::Type{Vector{T1}}, y::Type{DataVector{T2}}) = DataArray{promote_type(eltype(x), eltype(y)),1} +vecbind_promote_type{T1,T2}(x::Type{DataVector{T1}}, y::Type{Vector{T2}}) = DataArray{promote_type(eltype(x), eltype(y)),1} +vecbind_promote_type(a, b, c, ds...) = vecbind_promote_type(a, vecbind_promote_type(b, c, ds...)) +vecbind_promote_type(a, b, c) = vecbind_promote_type(a, vecbind_promote_type(b, c)) + +function vecbind_promote_type(a::AbstractVector) + res = None + if isdefined(a, 1) + if length(a) == 1 + return a[1] + else + if isdefined(a, 2) + res = vecbind_promote_type(a[1], a[2]) + else + res = a[1] + end + end + end + for i in 3:length(a) + if isdefined(a, i) + res = vecbind_promote_type(res, a[i]) + end + end + return res +end + +#constructor{T}(::Type{Vector{T}}, args...) = Array(T, args...) +#constructor{T}(::Type{DataVector{T}}, args...) = DataArray(T, args...) + +function vecbind(xs::AbstractVector...) + V = vecbind_promote_type(map(vecbind_type, {xs...})) + len = sum(length, xs) + res = constructor(V, len) + k = 1 + for i in 1:length(xs) + for j in 1:length(xs[i]) + res[k] = xs[i][j] + k += 1 + end + end + res +end + +function vecbind(xs::PooledDataVector...) + vecbind(map(x -> convert(DataArray, x), xs)...) +end diff --git a/test/data.jl b/test/data.jl index 3f23552ae4..6b5b43db33 100644 --- a/test/data.jl +++ b/test/data.jl @@ -144,7 +144,7 @@ module TestData @test isequal(d1s[13:24, :c], d1[:c]) @test all(names(d1s) .== [:variable, :value, :c, :d]) @test isequal(d1s, d1s3) - d1s_df = stack_df(d1, [:a, :b]) + d1s_df = stackdf(d1, [:a, :b]) @test isequal(d1s[:variable], d1s_df[:variable][:]) @test isequal(d1s[:value], d1s_df[:value][:]) @test isequal(d1s[:c], d1s_df[:c][:]) diff --git a/test/dataframe.jl b/test/dataframe.jl index 7ebf138e81..9fe7cdb289 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -43,7 +43,7 @@ module TestDataFrame # Deleting columns removes any mention from groupings delete!(x, :a) - # @test colnames(x) == [:b] + @test names(x) == [:b] ## del calls ref, which properly deals with groupings z2 = z[:,[1,1,2]] diff --git a/test/duplicates.jl b/test/duplicates.jl index c26c2df0ba..8090e366de 100644 --- a/test/duplicates.jl +++ b/test/duplicates.jl @@ -4,7 +4,9 @@ module TestDuplicates using DataFrames df = DataFrame(a = [1, 2, 3, 3, 4]) - @assert isequal(duplicated(df), [false, false, false, true, false]) - drop_duplicates!(df) - @assert isequal(df, DataFrame(a = [1, 2, 3, 4])) + udf = DataFrame(a = [1, 2, 3, 4]) + @test isequal(nonunique(df), [false, false, false, true, false]) + @test isequal(udf, unique(df)) + unique!(df) + @test isequal(df, udf) end diff --git a/test/runtests.jl b/test/runtests.jl index 3bfa8726ec..248a21e816 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,8 +17,9 @@ my_tests = ["data.jl", # "indexing.jl", "RDA.jl", "sort.jl", - "join.jl", "grouping.jl", + "join.jl", + "vcat.jl", "iteration.jl", "duplicates.jl", "show.jl"] diff --git a/test/rbind.jl b/test/vcat.jl similarity index 67% rename from test/rbind.jl rename to test/vcat.jl index 510669970e..1f25d564b9 100644 --- a/test/rbind.jl +++ b/test/vcat.jl @@ -1,4 +1,4 @@ -module TestRBind +module TestVcat using Base.Test using DataArrays using DataFrames @@ -14,30 +14,30 @@ module TestRBind df[1, :] = 1 # Assignment of columns - df[1] = datazeros(4) + df[1] = zeros(4) # Broadcasting assignment of columns df[:, 1] = 1 df[1] = 3 df[:x3] = 2 - rbind(null_df) - rbind(null_df, null_df) - rbind(null_df, df) - rbind(df, null_df) - rbind(df, df) - rbind(df, df, df) + vcat(null_df) + vcat(null_df, null_df) + vcat(null_df, df) + vcat(df, null_df) + vcat(df, df) + vcat(df, df, df) alt_df = deepcopy(df) - rbind(df, alt_df) - df[1] = datazeros(Int, nrow(df)) + vcat(df, alt_df) + df[1] = zeros(Int, nrow(df)) # Fail on non-matching types - rbind(df, alt_df) + vcat(df, alt_df) alt_df = deepcopy(df) names!(alt_df, [:A, :B, :C]) # Fail on non-matching names - rbind(df, alt_df) + vcat(df, alt_df) # df[:, 1] = dvzeros(Int, nrow(df)) end