Skip to content

Commit

Permalink
Updates to SubDataFrames, implementation of DataFrameRow
Browse files Browse the repository at this point in the history
* Parametrize SubDataFrames by T<:AbstractVector{Int}
* Changed SubDataFrame to immutable
* DRYed out sub() functions
* Deprecated subset (alias for sub), on the theory that we should
  remove redundant methods like this
* Add show() for DataFrameRow
* Change EachRow, EachCol -> eachrow, eachcol, to better match
  Base convention for iterators
* Return DataFrameRows from eachrow
* Changed Sort.lt comparisons to compare DataFrameRows (for issorted)
* Specialize collect(r::DataFrameRow), so that Dict(collect(r::DataFrameRow)) works
* Start to use size(df, 1) for nrow(df), size(df, 2) for ncol(df)
  • Loading branch information
kmsquire committed Jan 22, 2014
1 parent 64b18c0 commit e47accb
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 54 deletions.
8 changes: 6 additions & 2 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,14 @@ export # reconcile_groups,
complete_cases!,
cut,
DataFrame,
DataFrameRow,
DataStream,
describe,
dict,
drop_duplicates!,
duplicated,
EachCol,
EachRow,
eachcol,
eachrow,
findat,
flipud!,
flipud,
Expand Down Expand Up @@ -217,5 +218,8 @@ Base.@deprecate merge(df1::AbstractDataFrame, df2::AbstractDataFrame; on::Any =
Base.@deprecate colnames(adf::AbstractDataFrame) Base.names
Base.@deprecate colnames!(adf::AbstractDataFrame, vals) names!
Base.@deprecate coltypes(adf::AbstractDataFrame) types
Base.@deprecate EachRow eachrow
Base.@deprecate EachCol eachcol
Base.@deprecate subset sub

end # module DataFrames
122 changes: 75 additions & 47 deletions src/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -913,37 +913,25 @@ end

# a SubDataFrame is a lightweight wrapper around a DataFrame used most frequently in
# split/apply sorts of operations.
type SubDataFrame <: AbstractDataFrame

immutable SubDataFrame{T<:AbstractVector{Int}} <: AbstractDataFrame
parent::DataFrame
rows::Vector{Int} # maps from subdf row indexes to parent row indexes
rows::T # maps from subdf row indexes to parent row indexes

function SubDataFrame(parent::DataFrame, rows::Vector{Int})
if any(rows .< 1)
error("all SubDataFrame indices must be > 0")
end
if length(rows) > 0 && maximum(rows) > nrow(parent)
error("all SubDataFrame indices must be <= the number of rows of the DataFrame")
function SubDataFrame(parent::DataFrame, rows::T)
if length(rows) > 0
rmin, rmax = extrema(rows)
if rmin < 1 || rmax > size(parent, 1)
throw(BoundsError())
end
end
new(parent, rows)
end
end

Base.sub(D::DataFrame, r, c) = sub(D[[c]], r) # If columns are given, pass in a subsetted parent D.
# Columns are not copies, so it's not expensive.
Base.sub(D::DataFrame, r::Int) = sub(D, [r])
Base.sub(D::DataFrame, rs::Vector{Int}) = SubDataFrame(D, rs)
Base.sub(D::DataFrame, r) = sub(D, getindex(SimpleIndex(nrow(D)), r)) # this is a wacky fall-through that uses light-weight fake indexes!
Base.sub(D::DataFrame, ex::Expr) = sub(D, with(D, ex))

Base.sub(D::SubDataFrame, r, c) = sub(D[[c]], r)
Base.sub(D::SubDataFrame, r::Int) = sub(D, [r])
Base.sub(D::SubDataFrame, rs::Vector{Int}) = SubDataFrame(D.parent, D.rows[rs])
Base.sub(D::SubDataFrame, r) = sub(D, getindex(SimpleIndex(nrow(D)), r)) # another wacky fall-through
Base.sub(D::SubDataFrame, ex::Expr) = sub(D, with(D, ex))
const subset = sub

Base.filter(ex::Expr, df::AbstractDataFrame) = subset(df, ex)
Base.select(ex::Expr, df::AbstractDataFrame) = subset(df, ex)
# Convenience outer constructors for SubDataFrame.
# A single integer row is wrapped in a one-element vector and re-dispatched.
SubDataFrame(parent::DataFrame, row::Integer) = SubDataFrame(parent, [row])
# Any AbstractVector{Int} (its concrete type T becomes the type parameter) is
# handed to the parametrized constructor SubDataFrame{T}.
SubDataFrame{T<:AbstractVector{Int}}(parent::DataFrame, rows::T) = SubDataFrame{T}(parent,rows)
# Vectors of other Integer element types are converted with `int` and routed
# through `sub`.
SubDataFrame{S<:Integer}(parent::DataFrame, rows::AbstractVector{S}) = sub(parent, int(rows))

Base.getindex(df::SubDataFrame, c) = df.parent[df.rows, c]
Base.getindex(df::SubDataFrame, r, c) = df.parent[df.rows[r], c]
Expand All @@ -955,9 +943,45 @@ nrow(df::SubDataFrame) = length(df.rows)
ncol(df::SubDataFrame) = ncol(df.parent)
Base.names(df::SubDataFrame) = names(df.parent)

# Associative methods:
index(df::SubDataFrame) = index(df.parent)

# Sub definitions, creating a SubDataFrame

Base.sub{S<:Real}(D::DataFrame, rs::AbstractVector{S}) = SubDataFrame(D, rs)
Base.sub{S<:Real}(D::SubDataFrame, rs::AbstractVector{S}) = SubDataFrame(D.parent, D.rows[rs])
Base.sub(D::DataFrame, rs::AbstractVector{Bool}) = sub(D, getindex(SimpleIndex(nrow(D)), rs))
Base.sub(D::SubDataFrame, rs::AbstractVector{Bool}) = sub(D, getindex(SimpleIndex(nrow(D)), rs))

# Single-row view of any AbstractDataFrame.
# Delegates to the vector `sub` methods instead of calling the SubDataFrame
# constructor directly: the constructors only accept a `DataFrame` parent, so a
# direct call raises a MethodError when `D` is itself a SubDataFrame. Going
# through `sub` also translates `r` via `D.rows` into a parent row index.
Base.sub(D::AbstractDataFrame, r::Integer) = sub(D, Int[r])
Base.sub(D::AbstractDataFrame, ex::Expr) = sub(D, with(D, ex))
Base.sub(D::AbstractDataFrame, r) = sub(D, getindex(SimpleIndex(nrow(D)), r)) # fall-through that uses light-weight "fake" indexes

Base.sub(D::AbstractDataFrame, r, c) = sub(D[[c]], r)

Base.filter(ex::Expr, df::AbstractDataFrame) = sub(df, ex)
Base.select(ex::Expr, df::AbstractDataFrame) = sub(df, ex)

# Container for a DataFrame row

# Pairs an AbstractDataFrame with a row number so that a single row can be
# addressed by column only.
immutable DataFrameRow
df::AbstractDataFrame # the frame this row refers to
row::Int # row number within `df`
end

# Indexing with an array of columns yields another DataFrameRow over the
# column-subsetted frame `r.df[[idx]]`, keeping the same row number.
Base.getindex(r::DataFrameRow, idx::AbstractArray) = DataFrameRow(r.df[[idx]], r.row)
# Any other column index returns `r.df[r.row, idx]`.
Base.getindex(r::DataFrameRow, idx) = r.df[r.row, idx]
# Assignment is forwarded to the underlying frame at (r.row, idx).
Base.setindex!(r::DataFrameRow, value, idx) = setindex!(r.df, value, r.row, idx)
# Column names are those of the underlying frame.
Base.names(df::DataFrameRow) = names(df.df)
# `sub` on a row selects columns: a DataFrameRow over `r.df[[c]]`, same row number.
Base.sub(r::DataFrameRow, c) = DataFrameRow(r.df[[c]], r.row)
# Column index object is taken from the underlying frame.
index(r::DataFrameRow) = index(r.df)
# Length and iteration protocol for DataFrameRow.
# Each definition is qualified with `Base.` (as the other Base extensions in
# this file are) so that it adds a method to the Base generic function whether
# or not the enclosing module imports these names; an unqualified definition of
# a non-imported name would create an unrelated module-local function.

# Number of columns in the row.
Base.length(r::DataFrameRow) = size(r.df, 2)
# Last valid column position, enabling `r[end]`.
Base.endof(r::DataFrameRow) = size(r.df, 2)
# Materialize the row as a vector of (name, value) tuples by iterating it.
Base.collect(r::DataFrameRow) = (String,Any)[x for x in r]

# Iteration state is the column position, starting at 1.
Base.start(r::DataFrameRow) = 1
# Each step yields the (column name, value) pair at position `s`.
Base.next(r::DataFrameRow, s) = ((names(r)[s], r[s]), s+1)
Base.done(r::DataFrameRow, s) = s > length(r)

# delete!() deletes columns; deleterows!() deletes rows
# delete!(df, 1)
# delete!(df, "old")
Expand Down Expand Up @@ -1432,6 +1456,8 @@ function DataArrays.array(adf::AbstractDataFrame)
return res
end

DataArrays.array(r::DataFrameRow) = DataArrays.array(r.df[r.row,:])

function DataArrays.DataArray(adf::AbstractDataFrame,
T::DataType = reduce(typejoin, types(adf)))
n, p = size(adf)
Expand Down Expand Up @@ -1605,7 +1631,7 @@ end
DFPerm{O<:Ordering}(o::O, df::AbstractDataFrame) = DFPerm{O,typeof(df)}(o,df)

# For sorting, a and b are row indices (first two lt definitions)
# For issorted, the default row iterator returns SubDataFrames instead,
# For issorted, the default row iterator returns DataFrameRows instead,
# so two more lt methods are defined below
function Base.Sort.lt{V<:AbstractVector}(o::DFPerm{V}, a, b)
for i = 1:ncol(o.df)
Expand All @@ -1631,24 +1657,24 @@ function Base.Sort.lt{O<:Ordering}(o::DFPerm{O}, a, b)
false
end

function Base.Sort.lt{V<:AbstractVector}(o::DFPerm{V}, a::AbstractDataFrame, b::AbstractDataFrame)
function Base.Sort.lt{V<:AbstractVector}(o::DFPerm{V}, a::DataFrameRow, b::DataFrameRow)
for i = 1:ncol(o.df)
if lt(o.ord[i], a[1,i], b[1,i])
if lt(o.ord[i], a[i], b[i])
return true
end
if lt(o.ord[i], b[1,i], a[1,i])
if lt(o.ord[i], b[i], a[i])
return false
end
end
false
end

function Base.Sort.lt{O<:Ordering}(o::DFPerm{O}, a::AbstractDataFrame, b::AbstractDataFrame)
function Base.Sort.lt{O<:Ordering}(o::DFPerm{O}, a::DataFrameRow, b::DataFrameRow)
for i = 1:ncol(o.df)
if lt(o.ord, a[1,i], b[1,i])
if lt(o.ord, a[i], b[i])
return true
end
if lt(o.ord, b[1,i], a[1,i])
if lt(o.ord, b[i], a[i])
return false
end
end
Expand Down Expand Up @@ -1818,7 +1844,7 @@ end
########################

issorted(df::AbstractDataFrame; cols={}, lt=isless, by=identity, rev=false, order=Forward) =
issorted(EachRow(df), ordering(df, cols, lt, by, rev, order))
issorted(eachrow(df), ordering(df, cols, lt, by, rev, order))

# sort!, sort, and sortperm functions

Expand Down Expand Up @@ -1904,39 +1930,41 @@ DataArrays.reorder(fun::Function, x::PooledDataArray, y::AbstractVector...) =

##############################################################################
##
## Iteration: EachRow, EachCol
## Iteration: eachrow, eachcol
##
##############################################################################

# Iteration by rows
type DFRowIterator
immutable DFRowIterator
df::AbstractDataFrame
end
EachRow(df::AbstractDataFrame) = DFRowIterator(df)
eachrow(df::AbstractDataFrame) = DFRowIterator(df)

Base.start(itr::DFRowIterator) = 1
Base.done(itr::DFRowIterator, i::Int) = i > nrow(itr.df)
Base.next(itr::DFRowIterator, i::Int) = (itr.df[i, :], i + 1)
Base.size(itr::DFRowIterator) = (nrow(itr.df), )
Base.length(itr::DFRowIterator) = nrow(itr.df)
Base.getindex(itr::DFRowIterator, i::Any) = itr.df[i, :]
Base.done(itr::DFRowIterator, i::Int) = i > size(itr.df, 1)
Base.next(itr::DFRowIterator, i::Int) = (DataFrameRow(itr.df, i), i + 1)
Base.size(itr::DFRowIterator) = (size(itr.df, 1), )
Base.length(itr::DFRowIterator) = size(itr.df, 1)
Base.getindex(itr::DFRowIterator, i::Any) = DataFrameRow(itr.df, i)
Base.map(f::Function, dfri::DFRowIterator) = [f(row) for row in dfri]


# Iteration by columns
type DFColumnIterator
immutable DFColumnIterator
df::AbstractDataFrame
end
EachCol(df::AbstractDataFrame) = DFColumnIterator(df)
eachcol(df::AbstractDataFrame) = DFColumnIterator(df)

Base.start(itr::DFColumnIterator) = 1
Base.done(itr::DFColumnIterator, j::Int) = j > ncol(itr.df)
Base.done(itr::DFColumnIterator, j::Int) = j > size(itr.df, 2)
Base.next(itr::DFColumnIterator, j::Int) = (itr.df[:, j], j + 1)
Base.size(itr::DFColumnIterator) = (ncol(itr.df), )
Base.length(itr::DFColumnIterator) = ncol(itr.df)
Base.size(itr::DFColumnIterator) = (size(itr.df, 2), )
Base.length(itr::DFColumnIterator) = size(itr.df, 2)
Base.getindex(itr::DFColumnIterator, j::Any) = itr.df[:, j]
function Base.map(f::Function, dfci::DFColumnIterator)
# note: `f` must return a consistent length
res = DataFrame()
for i = 1:ncol(dfci.df)
for i = 1:size(dfci.df, 2)
res[i] = f(dfci[i])
end
names!(res, names(dfci.df))
Expand Down
10 changes: 10 additions & 0 deletions src/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,16 @@ function Base.show(adf::AbstractDataFrame,
show(STDOUT, adf, splitchunks)
end

# Print a DataFrameRow to `io`: a header line with the row number, then one line
# per column consisting of the column name padded to a common width followed by
# the value.
function Base.show(io::IO, r::DataFrameRow)
    labels = names(r)
    # Widest column name plus two characters of separation. A row with no columns
    # has nothing to reduce over, so fall back to the bare separation width.
    labelwidth = isempty(labels) ? 2 : mapreduce(length, max, labels) + 2
    # The header must go to `io`; without the stream argument @printf writes to
    # standard output regardless of the stream passed to `show`.
    @printf(io, "DataFrameRow (row %d)\n", r.row)
    for (label, value) in r
        println(io, rpad(label, labelwidth, ' '), value)
    end
end

Base.show(row::DataFrameRow) = show(STDOUT, row)

function Base.showall(io::IO,
adf::AbstractDataFrame,
splitchunks::Bool = false,
Expand Down
31 changes: 26 additions & 5 deletions test/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,37 @@ module TestIteration
@assert ndims(el) == 0
end

for row in EachRow(df)
@assert isa(row, DataFrame)
for row in eachrow(df)
@assert isa(row, DataFrameRow)
@assert row["B"]-row["A"] == 1
end

for col in EachCol(df)
for col in eachcol(df)
@assert isa(col, AbstractDataVector)
end

@assert isequal(map(x -> minimum(array(x)), EachRow(df)), {1,2})
@assert isequal(map(minimum, EachCol(df)), DataFrame(A = [1], B = [2]))
@assert isequal(map(x -> minimum(array(x)), eachrow(df)), {1,2})
@assert isequal(map(minimum, eachcol(df)), DataFrame(A = [1], B = [2]))

row = DataFrameRow(df, 1)

row["A"] = 100
@assert df[1, "A"] == 100

row[1] = 101
@assert df[1, "A"] == 101

df = DataFrame(A = 1:4, B = ["M", "F", "F", "M"])

s1 = sub(df, 1:3)
s1[2,"A"] = 4
@assert df[2, "A"] == 4
@assert sub(s1, 1:2) == sub(df, 1:2)

s2 = sub(df, 1:2:3)
s2[2, "B"] = "M"
@assert df[3, "B"] == "M"
@assert sub(s2, 1:1:2) == sub(df, [1,3])

# @test_fail for x in df; end # Raises an error
end

0 comments on commit e47accb

Please sign in to comment.