From 630660e90282f118f208ec5cb464ae994adf4276 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat
Date: Tue, 26 Dec 2017 22:31:14 +0100
Subject: [PATCH] Add filter() and filter!() methods (#1330)

Currently these cannot be made efficient as the column types are not
encoded in DataFrameRow, but this can be improved in the future with
NamedTuples.
---
 docs/src/lib/functions.md                  |  2 +
 docs/src/man/subsets.md                    | 23 +++++++++
 src/abstractdataframe/abstractdataframe.jl | 57 ++++++++++++++++++++++
 test/data.jl                               |  8 +++
 4 files changed, 90 insertions(+)

diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
index 61f40f9d03..a83c23b5fb 100644
--- a/docs/src/lib/functions.md
+++ b/docs/src/lib/functions.md
@@ -37,6 +37,8 @@ dropmissing!
 eachcol
 eachrow
 eltypes
+filter
+filter!
 head
 names
 names!
diff --git a/docs/src/man/subsets.md b/docs/src/man/subsets.md
index b20d4ce189..a24940acbd 100644
--- a/docs/src/man/subsets.md
+++ b/docs/src/man/subsets.md
@@ -84,3 +84,26 @@ julia> df[1:3, [:B, :A]]
 │ 3   │ 6 │ 3 │
 
 ```
+
+Selecting a subset of rows matching a condition:
+
+```jldoctest subsets
+julia> filter(row -> row[:A] > 5, df)
+5×2 DataFrames.DataFrame
+│ Row │ A  │ B  │
+├─────┼────┼────┤
+│ 1   │ 6  │ 12 │
+│ 2   │ 7  │ 14 │
+│ 3   │ 8  │ 16 │
+│ 4   │ 9  │ 18 │
+│ 5   │ 10 │ 20 │
+
+julia> filter(row -> (row[:A] > 5) && (row[:B] < 15), df)
+2×2 DataFrames.DataFrame
+│ Row │ A │ B  │
+├─────┼───┼────┤
+│ 1   │ 6 │ 12 │
+│ 2   │ 7 │ 14 │
+```
+
+`filter!` behaves the same way, but modifies the data frame in place.
\ No newline at end of file
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 3262fb2ae6..ea292f5cb7 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -539,6 +539,63 @@ dropmissing!(df)
 """
 dropmissing!(df::AbstractDataFrame) = deleterows!(df, find(!, completecases(df)))
 
+"""
+    filter(function, df::AbstractDataFrame)
+
+Return a copy of data frame `df` containing only rows for which `function` returns `true`.
+`function` is passed each row as a `DataFrameRow` and must return a `Bool` (else `TypeError`).
+
+# Examples
+```
+julia> df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"])
+4×2 DataFrames.DataFrame
+│ Row │ x │ y │
+├─────┼───┼───┤
+│ 1   │ 3 │ b │
+│ 2   │ 1 │ c │
+│ 3   │ 2 │ a │
+│ 4   │ 1 │ b │
+
+julia> filter(row -> row[:x] > 1, df)
+2×2 DataFrames.DataFrame
+│ Row │ x │ y │
+├─────┼───┼───┤
+│ 1   │ 3 │ b │
+│ 2   │ 2 │ a │
+```
+"""
+Base.filter(f, df::AbstractDataFrame) = df[collect(f(r)::Bool for r in eachrow(df)), :]
+
+"""
+    filter!(function, df::AbstractDataFrame)
+
+Remove rows from data frame `df` for which `function` returns `false`, and return `df`.
+`function` is passed each row as a `DataFrameRow` and must return a `Bool` (else `TypeError`).
+
+# Examples
+```
+julia> df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"])
+4×2 DataFrames.DataFrame
+│ Row │ x │ y │
+├─────┼───┼───┤
+│ 1   │ 3 │ b │
+│ 2   │ 1 │ c │
+│ 3   │ 2 │ a │
+│ 4   │ 1 │ b │
+
+julia> filter!(row -> row[:x] > 1, df);
+
+julia> df
+2×2 DataFrames.DataFrame
+│ Row │ x │ y │
+├─────┼───┼───┤
+│ 1   │ 3 │ b │
+│ 2   │ 2 │ a │
+```
+"""
+Base.filter!(f, df::AbstractDataFrame) =
+    deleterows!(df, find(collect(!(f(r)::Bool) for r in eachrow(df))))
+
 function Base.convert(::Type{Array}, df::AbstractDataFrame)
     convert(Matrix, df)
 end
diff --git a/test/data.jl b/test/data.jl
index 3b5bc949a9..cca0682a70 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -317,4 +317,12 @@ module TestData
     #test unique!() with extra argument
     unique!(df, [1, 3])
     @test df == df1
+
+    #test filter() and filter!()
+    df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"])
+    @test filter(r -> r[:x] > 1, df) == DataFrame(x = [3, 2], y = ["b", "a"])
+    @test filter!(r -> r[:x] > 1, df) === df == DataFrame(x = [3, 2], y = ["b", "a"])
+    df = DataFrame(x = [3, 1, 2, 1, missing], y = ["b", "c", "a", "b", "c"])
+    @test_throws TypeError filter(r -> r[:x] > 1, df)
+    @test_throws TypeError filter!(r -> r[:x] > 1, df)
 end