Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Function renaming and deprecation from #517 #538

Merged
merged 1 commit into from
Feb 10, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmarks/datamatrix.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ df3 = benchmark(f3,
1_000)

# TODO: Keep permanent record
printtable(rbind(df1, df2, df3), header=false)
printtable(vcat(df1, df2, df3), header=false)

# Compare with R
# We're 10x as fast!
Expand Down
25 changes: 6 additions & 19 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,11 @@ export @~,
combine,
complete_cases,
complete_cases!,
cut,
DataFrame,
DataFrameRow,
describe,
drop_duplicates!,
duplicated,
eachcol,
eachrow,
findat,
flipud!,
flipud,
Formula,
Expand All @@ -73,20 +69,20 @@ export @~,
groupby,
GroupedDataFrame,
interaction_design_matrix,
load_df,
loaddf,
matrix,
merge,
model_response,
ModelFrame,
ModelMatrix,
names!,
ncol,
nonunique,
nrow,
order,
pool,
pool!,
printtable,
range,
rbind,
readtable,
rename!,
Expand All @@ -97,25 +93,16 @@ export @~,
subset,
types,
unique,
unique!,
unstack,
vector,
writetable,
xtab,
xtabs,
stack_df,
stackdf,
melt,
melt_df,
pivot_table,
RComplex, # Vector{Complex128}
RInteger, # Vector{Int32} plus BitVector of NA indicators
RLogical, # BitVector of values and BitVector of NA indicators
RNumeric, # Vector{Float64}
RList, # Vector{Any}
RString, # Vector{ASCIIString}
RSymbol, # Symbol stored as an String b/c of embedded '.'

class, # in the S3 sense of "class"
inherits,
meltdf,
pivottable,
read_rda,
vecbind

Expand Down
97 changes: 8 additions & 89 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -902,11 +902,9 @@ end
# Drop a single column identified by its integer position.
function without(df::DataFrame, i::Int)
    return without(df, [i])
end

# Drop column(s) identified by any selector the column index understands;
# the selector is resolved to positions through `df.colindex`.
function without(df::DataFrame, c::Any)
    return without(df, df.colindex[c])
end

#### cbind, rbind, hcat, vcat
# hcat() is just cbind()
# rbind(df, ...) only accepts data frames. Finds union of columns, maintaining order
#### hcat, vcat
# vcat(df, ...) only accepts data frames. Finds union of columns, maintaining order
# of first df. Missing data becomes NAs.
# vcat() is just rbind()

# two-argument form, two dfs, references only
function Base.hcat(df1::DataFrame, df2::DataFrame)
Expand All @@ -922,7 +920,6 @@ Base.hcat{T}(df::DataFrame, x::T) = hcat(df, DataFrame({DataArray([x])}))

# three-plus-argument form recurses
Base.hcat(a::DataFrame, b, c...) = hcat(hcat(a, b), c...)
cbind(args...) = hcat(args...)

# Build a DataFrame with the same column names as `df` whose columns are
# `similar` to the originals, each allocated with size `dims`.
function Base.similar(df::DataFrame, dims)
    newcols = [similar(col, dims) for col in df.columns]
    return DataFrame(newcols, names(df))
end
Expand All @@ -938,60 +935,6 @@ nas{T,R}(dv::PooledDataVector{T,R}, dims) =
# Build an all-NA DataFrame with the same column names as `df`, where each
# column is produced by the per-column `nas` method with size `dims`.
function nas(df::DataFrame, dims)
    nacols = [nas(col, dims) for col in df.columns]
    return DataFrame(nacols, names(df))
end

# Map a concrete vector to the container type its elements should occupy
# after vecbind: plain Vectors stay Vectors, anything NA-capable
# (AbstractDataVector / PooledDataVector) widens to DataVector.
vecbind_type{T}(::Vector{T}) = Vector{T}
vecbind_type{T<:AbstractVector}(x::T) = Vector{eltype(x)}
vecbind_type{T<:AbstractDataVector}(x::T) = DataVector{eltype(x)}
vecbind_type{T}(::PooledDataVector{T}) = DataVector{T}

# Promote two container *types* (as returned by `vecbind_type`) to the
# type that can hold elements of both; mixing a DataVector with anything
# yields a DataArray so NAs are representable.
vecbind_promote_type{T1,T2}(x::Type{Vector{T1}}, y::Type{Vector{T2}}) = Array{promote_type(eltype(x), eltype(y)),1}
vecbind_promote_type{T1,T2}(x::Type{DataVector{T1}}, y::Type{DataVector{T2}}) = DataArray{promote_type(eltype(x), eltype(y)),1}
vecbind_promote_type{T1,T2}(x::Type{Vector{T1}}, y::Type{DataVector{T2}}) = DataArray{promote_type(eltype(x), eltype(y)),1}
vecbind_promote_type{T1,T2}(x::Type{DataVector{T1}}, y::Type{Vector{T2}}) = DataArray{promote_type(eltype(x), eltype(y)),1}
# N-ary promotion: fold right-to-left through the pairwise method above.
vecbind_promote_type(a, b, c, ds...) = vecbind_promote_type(a, vecbind_promote_type(b, c, ds...))
vecbind_promote_type(a, b, c) = vecbind_promote_type(a, vecbind_promote_type(b, c))

# Promote every defined entry of `a` (a vector of container types produced
# by `vecbind_type`) down to a single common container type.
# Entries may be #undef, hence the `isdefined(a, i)` guards on each access;
# undefined slots are simply skipped. Returns `None` (pre-0.4 Julia's
# bottom type) if no entry of `a` is defined.
function vecbind_promote_type(a::AbstractVector)
res = None
if isdefined(a, 1)
if length(a) == 1
# Single entry: nothing to promote, return it as-is.
return a[1]
else
if isdefined(a, 2)
# Seed the accumulator with the promotion of the first pair.
res = vecbind_promote_type(a[1], a[2])
else
res = a[1]
end
end
end
# Fold the remaining entries (indices 3:end) into the accumulator.
for i in 3:length(a)
if isdefined(a, i)
res = vecbind_promote_type(res, a[i])
end
end
return res
end

# Allocate an uninitialized container of the vecbind result type: a plain
# Array for Vector{T}, a DataArray for DataVector{T}.
function constructor{T}(::Type{Vector{T}}, args...)
    return Array(T, args...)
end
function constructor{T}(::Type{DataVector{T}}, args...)
    return DataArray(T, args...)
end

# Concatenate any number of vectors into one container whose type is the
# common promotion of all input types (see `vecbind_promote_type`), copying
# the elements over one by one.
function vecbind(xs::AbstractVector...)
# `{xs...}` builds an Any-vector of the inputs (pre-0.4 syntax) so the
# promotion machinery can inspect each element's container type.
V = vecbind_promote_type(map(vecbind_type, {xs...}))
len = sum(length, xs)
# Allocate the (uninitialized) result of the promoted type.
res = constructor(V, len)
k = 1
# Copy every element of every input, in order, into the result.
for i in 1:length(xs)
for j in 1:length(xs[i])
res[k] = xs[i][j]
k += 1
end
end
res
end

# Pooled inputs: widen each PooledDataVector to a DataArray first, then
# reuse the generic vecbind path.
function vecbind(xs::PooledDataVector...)
    widened = [convert(DataArray, x) for x in xs]
    return vecbind(widened...)
end

# vcat of a single data frame is the identity.
Base.vcat(df::AbstractDataFrame) = df

# vcat of a homogeneous vector of data frames splats into the varargs method.
function Base.vcat{T<:AbstractDataFrame}(dfs::Vector{T})
    return vcat(dfs...)
end
Expand Down Expand Up @@ -1027,8 +970,6 @@ function Base.vcat(dfs::AbstractDataFrame...)
res
end

rbind(args...) = vcat(args...)

##
## Miscellaneous
##
Expand Down Expand Up @@ -1070,7 +1011,7 @@ function DataArrays.DataArray(adf::AbstractDataFrame,
return dm
end

function duplicated(df::AbstractDataFrame)
function nonunique(df::AbstractDataFrame)
# Return a Vector{Bool} indicating whether the row is a duplicate
# of a prior row.
res = fill(false, nrow(df))
Expand All @@ -1085,15 +1026,15 @@ function duplicated(df::AbstractDataFrame)
res
end

function drop_duplicates!(df::AbstractDataFrame)
deleterows!(df, find(!duplicated(df)))
# Remove duplicate rows from `df` in place, keeping the first occurrence
# of each distinct row.
# NOTE(review): this passes the indices of the *non*-duplicate rows to
# `deleterows!` — identical to the old `drop_duplicates!` it replaces —
# which implies `deleterows!` keeps (rather than deletes) the listed rows;
# confirm against the `deleterows!` definition.
function unique!(df::AbstractDataFrame)
deleterows!(df, find(!nonunique(df)))
end

# Unique rows of an AbstractDataFrame.
Base.unique(df::AbstractDataFrame) = df[!duplicated(df), :]
Base.unique(df::AbstractDataFrame) = df[!nonunique(df), :]

function duplicatedkey(df::AbstractDataFrame)
# Here's another (probably a lot faster) way to do `duplicated`
function nonuniquekey(df::AbstractDataFrame)
# Here's another (probably a lot faster) way to do `nonunique`
# by grouping on all columns. It will fail if columns cannot be
# made into PooledDataVector's.
gd = groupby(df, names(df))
Expand All @@ -1110,28 +1051,6 @@ function cleannames!(df::DataFrame)
return
end

# Return a new DataFrame with the rows of `df` in reverse order.
function Base.flipud(df::DataFrame)
    return df[nrow(df):-1:1, :]
end

# Reverse the rows of `df` in place; returns `nothing`.
function flipud!(df::DataFrame)
    n = nrow(df)
    df[1:n, :] = df[n:-1:1, :]
    return
end

# Reorder the levels of the factor `x` by per-level summary statistics of
# the columns of `df`: group `df`'s rows by the level of `x`, apply `fun`
# column-wise within each group, and sort the levels by the resulting
# summaries.
function DataArrays.reorder(fun::Function, x::PooledDataArray, df::AbstractDataFrame)
# Work on a copy so the caller's frame is not mutated by the key column.
dfc = copy(df)
dfc["__key__"] = x
# One row per level: `fun` applied column-wise within each group.
gd = by(dfc, "__key__", df -> colwise(fun, without(df, "__key__")))
# Order the levels by the summary columns (everything after the key).
# NOTE(review): assumes `sortperm` accepts a DataFrame here — confirm.
idx = sortperm(gd[[2:ncol(gd)]])
# Rebuild the pool in the new level order (column 1 holds the keys).
return PooledDataArray(x, dropna(gd[idx,1]))
end
DataArrays.reorder(x::PooledDataArray, df::AbstractDataFrame) = reorder(:mean, x, df)

# Convenience method: accept loose vectors, wrap them into a DataFrame
# (`{y...}` is the pre-0.4 Any-vector literal), and delegate to the
# DataFrame-based method.
function DataArrays.reorder(fun::Function, x::PooledDataArray, y::AbstractVector...)
    return reorder(fun, x, DataFrame({y...}))
end

##############################################################################
##
## Hashing
Expand Down
2 changes: 1 addition & 1 deletion src/dataframe/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1079,7 +1079,7 @@ function save(filename::String, df::AbstractDataFrame)
return
end

function load_df(filename::String)
function loaddf(filename::String)
f = open(filename)
dd = deserialize(f)
close(f)
Expand Down
27 changes: 12 additions & 15 deletions src/dataframe/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ unstack(df::AbstractDataFrame, rowkey, value, colkey) =

##############################################################################
##
## pivot_table()
## pivottable()
##
##############################################################################

Expand All @@ -75,7 +75,7 @@ unstack(df::AbstractDataFrame, rowkey, value, colkey) =
# - can't have zero rows or zero columns
# - the resulting data part is Float64 (`payload` below)

function pivot_table(df::AbstractDataFrame, rows::Vector{Int}, cols::Vector{Int}, value::Int, fun::Function)
function pivottable(df::AbstractDataFrame, rows::Vector{Int}, cols::Vector{Int}, value::Int, fun::Function)
# `rows` vector indicating which columns are keys placed in rows
# `cols` vector indicating which columns are keys placed as column headers
# `value` integer indicating which column has values
Expand All @@ -99,8 +99,8 @@ function pivot_table(df::AbstractDataFrame, rows::Vector{Int}, cols::Vector{Int}
hcat(row_key_df, payload)
end
# `mean` is the default aggregation function:
pivot_table(df::AbstractDataFrame, rows, cols, value) = pivot_table(df, rows, cols, value, mean)
pivot_table(df::AbstractDataFrame, rows, cols, value, fun) = pivot_table(df, [index(df)[rows]], [index(df)[cols]], index(df)[value], fun)
# Four-argument form: `mean` is the default aggregation function.
function pivottable(df::AbstractDataFrame, rows, cols, value)
    return pivottable(df, rows, cols, value, mean)
end

# Untyped five-argument form: resolve row/column/value selectors to
# integer column indices, then call the fully-typed kernel method.
function pivottable(df::AbstractDataFrame, rows, cols, value, fun)
    rowinds = [index(df)[rows]]
    colinds = [index(df)[cols]]
    valind = index(df)[value]
    return pivottable(df, rowinds, colinds, valind, fun)
end


##############################################################################
Expand Down Expand Up @@ -157,7 +157,6 @@ Base.size(v::StackedVector) = (length(v),)
# AbstractVector interface for StackedVector: a lazy concatenation view
# over `v.components`.
# Total length is the sum of the component lengths.
Base.length(v::StackedVector) = sum(map(length, v.components))
Base.ndims(v::StackedVector) = 1
# Element type is the promotion of all component element types.
Base.eltype(v::StackedVector) = promote_type(map(eltype, v.components)...)
# vecbind result type: promote the vecbind types of every component.
vecbind_type(v::StackedVector) = vecbind_promote_type(map(vecbind_type, v.components)...)
# Allocation delegates to the first component's `similar`.
Base.similar(v::StackedVector, T, dims::Dims) = similar(v.components[1], T, dims)

Base.show(io::IO, v::StackedVector) = internal_show_vector(io, v)
Expand All @@ -179,7 +178,6 @@ Base.size(v::RepeatedVector) = (length(v),)
# AbstractVector interface for RepeatedVector: a lazy view of `v.parent`
# repeated `v.n` times end-to-end.
Base.length(v::RepeatedVector) = v.n * length(v.parent)
Base.ndims(v::RepeatedVector) = 1
Base.eltype{T}(v::RepeatedVector{T}) = T
# vecbind result type is that of the parent vector.
vecbind_type(v::RepeatedVector) = vecbind_type(v.parent)
# Reversing the parent reverses every repetition of the whole sequence.
Base.reverse(v::RepeatedVector) = RepeatedVector(reverse(v.parent), v.n)
Base.similar(v::RepeatedVector, T, dims::Dims) = similar(v.parent, T, dims)

Expand Down Expand Up @@ -208,7 +206,6 @@ Base.size(v::EachRepeatedVector) = (length(v),)
# AbstractVector interface for EachRepeatedVector: a lazy view where each
# element of `v.parent` is repeated `v.n` times in place.
Base.length(v::EachRepeatedVector) = v.n * length(v.parent)
Base.ndims(v::EachRepeatedVector) = 1
Base.eltype{T}(v::EachRepeatedVector{T}) = T
# vecbind result type is that of the parent vector.
vecbind_type(v::EachRepeatedVector) = vecbind_type(v.parent)
# Reversing the parent reverses the element order; per-element repetition
# counts are unchanged.
Base.reverse(v::EachRepeatedVector) = EachRepeatedVector(reverse(v.parent), v.n)
Base.similar(v::EachRepeatedVector, T, dims::Dims) = similar(v.parent, T, dims)

Expand Down Expand Up @@ -250,15 +247,15 @@ end

##############################################################################
##
## stack_df()
## melt_df()
## stackdf()
## meltdf()
## Reshaping using referencing (issue #145), using the above vector types
##
##############################################################################

# Same as `stack`, but uses references
# I'm not sure the name is very good
function stack_df(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vector{Int})
function stackdf(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vector{Int})
N = length(measure_vars)
remainingcols = _setdiff([1:ncol(df)], measure_vars)
cnames = names(df)[id_vars]
Expand All @@ -270,10 +267,10 @@ function stack_df(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vec
[RepeatedVector(df[:,c], N) for c in id_vars]...}, # id_var columns
cnames)
end
stack_df(df::AbstractDataFrame, measure_vars) = stack_df(df, [index(df)[measure_vars]])
stackdf(df::AbstractDataFrame, measure_vars) = stackdf(df, [index(df)[measure_vars]])

stack_df(df::AbstractDataFrame, measure_vars, id_vars) = stack_df(df, [index(df)[measure_vars]], [index(df)[id_vars]])
stack_df(df::AbstractDataFrame, measure_vars) = stack_df(df, [index(df)[measure_vars]], _setdiff([1:ncol(df)], [index(df)[measure_vars]]))
# Resolve both the measured and the id column selectors to integer
# indices, then call the fully-typed kernel method.
function stackdf(df::AbstractDataFrame, measure_vars, id_vars)
    return stackdf(df, [index(df)[measure_vars]], [index(df)[id_vars]])
end

# When no id columns are given, every column that is not measured is an id.
function stackdf(df::AbstractDataFrame, measure_vars)
    minds = [index(df)[measure_vars]]
    return stackdf(df, minds, _setdiff([1:ncol(df)], minds))
end

melt_df(df::AbstractDataFrame, id_vars) = stack_df(df, _setdiff([1:ncol(df)], index(df)[id_vars]))
melt_df(df::AbstractDataFrame, id_vars, measure_vars) = stack_df(df, measure_vars, id_vars)
# melt: every column not listed in `id_vars` is treated as a measured
# variable to be stacked.
function meltdf(df::AbstractDataFrame, id_vars)
    return stackdf(df, _setdiff([1:ncol(df)], index(df)[id_vars]))
end

# melt with explicit measured variables; note the argument order is the
# reverse of `stackdf`'s.
function meltdf(df::AbstractDataFrame, id_vars, measure_vars)
    return stackdf(df, measure_vars, id_vars)
end
Loading