Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add subset #2496

Merged
merged 17 commits into from
Jan 11, 2021
Prev Previous commit
Next Next commit
changes after code review
bkamins committed Dec 20, 2020
commit d614e5594c4e0061bda4825b2af2840d53b313d2
7 changes: 6 additions & 1 deletion docs/src/man/comparisons.md
Original file line number Diff line number Diff line change
@@ -204,9 +204,14 @@ df <- tibble(grp = rep(1:2, 3), x = 6:1, y = 4:9,
| Rename columns | `rename(df, x_new = x)` | `rename(df, :x => :x_new)` |
| Pick columns | `select(df, x, y)` | `select(df, :x, :y)` |
| Pick & transform columns | `transmute(df, mean(x), y)` | `select(df, :x => mean, :y)` |
| Pick rows | `filter(df, x >= 1)` | `subset(df, :x => ByRow(>=(1)))` |
| Pick rows | `filter(df, x >= 1)` | `subset(df, :x => ByRow(x -> x >= 1))` |
| Sort rows | `arrange(df, x)` | `sort(df, :x)` |

Note that in the table above the `x -> x >= 1` predicate can be more compactly
written as `>=(1)`. The latter form has the additional benefit that it is
compiled only once per Julia session (as opposed to `x -> x >= 1`, which defines
a new anonymous function every time it is introduced).

As in dplyr, some of these functions can be applied to grouped data frames, in which case they operate by group:

| Operation | dplyr | DataFrames.jl |
179 changes: 106 additions & 73 deletions src/abstractdataframe/subset.jl
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@ _process_subset_pair(i::Int, @nospecialize(a::Pair{<:Any, <:Base.Callable})) =
_process_subset_pair(i::Int, a) =
throw(ArgumentError("condition specifier $a is not supported by `subset`"))

function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
@noinline function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
@nospecialize(args), skipmissing::Bool)
conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)]

@@ -18,14 +18,42 @@ function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
df_conditions = select(df, conditions...,
copycols=!(parent(df) isa DataFrame), keepkeys=false)
end
test_type = skipmissing ? Union{Missing, Bool} : Bool
for col in eachcol(df_conditions)
if !(eltype(col) <: test_type)
if !(eltype(col) <: Union{Missing, Bool})
throw(ArgumentError("each transformation must produce a vector whose " *
"eltype is subtype of $test_type"))
"eltype is subtype of Union{Missing, Bool}"))
end
end

@assert ncol(df_conditions) == length(conditions)

if ncol(df_conditions) == 1
cond = df_conditions[!, 1]
else
cond = .&(eachcol(df_conditions)...)
end

@assert eltype(cond) <: Union{Bool, Missing}

if skipmissing
return coalesce.(cond, false)
else
# currently the inner check is only evaluated when cond actually
# contains some missing values, but this might change in the future,
# so the check is kept
if Missing <: eltype(cond)
pos = findfirst(ismissing, cond)
if pos !== nothing
# note that the user might have passed a GroupedDataFrame, but we
# then refer to the parent data frame of this GroupedDataFrame
throw(ArgumentError("skipmissing=false and in row $pos of the" *
" passed conditions were evaluated to" *
" missing value"))
end
end
return cond
end

if ncol(df_conditions) == 1
return .===(df_conditions[!, 1], true)
else
@@ -40,9 +68,9 @@ end
Return a copy of data frame `df` or parent of `gdf` containing only rows for
which all values produced by transformation(s) `args` for a given row are `true`.

If `skipmissing=true`, returning `missing` in `args` is allowed and corresponding rows are
dropped. If `skipmissing=false` (the default) an error is thrown if `missing` is
present.
If `skipmissing=true`, the conjunction of the results produced by `args` is
allowed to be `missing`, and the corresponding rows are dropped. If
`skipmissing=false` (the default), an error is thrown if `missing` is produced.

Each argument passed in `args` can be either a single column selector or a
`source_columns => function` transformation specifier following the rules
@@ -65,54 +93,54 @@ See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref), [`select`](@r
julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false],
z=[true, true, missing, missing], v=[1, 2, 11, 12])
4×5 DataFrame
Row │ id x y z │ v │
│ Int64 Bool Bool Bool? Int64
─────┼───────────────────────────────────
│ 1 │ 1 │ 1 1 │ 1 │ 1
│ 2 │ 2 │ 0 │ 1 │ 1 │ 2 │
│ 3 │ 3 │ 1 │ 0 │ missing │ 11
│ 4 │ 4 │ 0 │ 0 │ missing │ 12
Row │ id x y z v
│ Int64 Bool Bool Bool? Int64
─────┼─────────────────────────────────────
1 │ 1 true true true 1
2 │ 2 false true true 2
3 │ 3 true false missing 11
4 │ 4 false false missing 12

julia> subset(df, :x)
2×5 DataFrame
Row │ id x y z │ v │
│ Int64 Bool Bool Bool? Int64
─────┼───────────────────────────────────
│ 1 │ 1 │ 1 1 │ 1 │ 1
│ 2 3 │ 1 │ 0 │ missing │ 11
Row │ id x y z v
│ Int64 Bool Bool Bool? Int64
─────┼───────────────────────────────────
1 │ 1 true true true 1
2 │ 3 true false missing 11

julia> subset(df, :v => x -> x .> 3)
2×5 DataFrame
Row │ id x y z │ v │
│ Int64 Bool Bool Bool? Int64
─────┼───────────────────────────────────
│ 1 3 │ 1 │ 0 │ missing │ 11
│ 2 4 │ 0 │ 0 │ missing │ 12
Row │ id x y z v
│ Int64 Bool Bool Bool? Int64
─────┼─────────────────────────────────────
1 │ 3 true false missing 11
2 │ 4 false false missing 12

julia> subset(df, :x, :y => ByRow(!))
1×5 DataFrame
Row │ id x y z │ v │
│ Int64 Bool Bool Bool? Int64
─────┼───────────────────────────────────
│ 1 3 │ 1 │ 0 │ missing │ 11
Row │ id x y z v
│ Int64 Bool Bool Bool? Int64
─────┼───────────────────────────────────
1 │ 3 true false missing 11

julia> subset(df, :x, :z, skipmissing=true)
1×5 DataFrame
Row │ id x y z │ v │
│ Int64 Bool Bool Bool? Int64
─────┼─────────────────────────────────
│ 1 │ 1 │ 1 1 │ 1 │ 1 │
Row │ id x y z v
│ Int64 Bool Bool Bool? Int64
─────┼─────────────────────────────────
1 │ 1 true true true 1

julia> subset(df, :x, :z)
ERROR: ArgumentError: each transformation must return a vector whose eltype is subtype of Bool
ERROR: ArgumentError: skipmissing=false and in row 3 of the passed conditions were evaluated to missing value

julia> subset(groupby(df, :y), :v => x -> x .> minimum(x))
2×5 DataFrame
Row │ id x y z │ v │
│ Int64 Bool Bool Bool? Int64
─────┼───────────────────────────────────
│ 1 2 │ 0 │ 1 │ 1 │ 2 │
│ 2 4 │ 0 │ 0 │ missing │ 12
Row │ id x y z v
│ Int64 Bool Bool Bool? Int64
─────┼─────────────────────────────────────
1 │ 2 false true true 2
2 │ 4 false false missing 12
```
"""
@inline function subset(df::AbstractDataFrame, @nospecialize(args...);
@@ -135,9 +163,9 @@ end
Update data frame `df` or the parent of `gdf` in place to contain only rows for
which all values produced by transformation(s) `args` for a given row are `true`.

If `skipmissing=true`, returning `missing` in `args` is allowed and these rows are
dropped. If `skipmissing=false` (the default), an error is thrown if `missing` is
present.
If `skipmissing=true`, the conjunction of the results produced by `args` is
allowed to be `missing`, and the corresponding rows are dropped. If
`skipmissing=false` (the default), an error is thrown if `missing` is produced.

Each argument passed in `args` can be either a single column selector or a
`source_columns => function` transformation specifier following the rules
@@ -157,52 +185,57 @@ See also: [`subset`](@ref), [`filter`](@ref), [`filter!`](@ref)
```
julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false])
4×3 DataFrame
Row │ id x │ y │
│ Int64 Bool │ Bool │
─────┼───────────────────
│ 1 1 1 │ 1 │
│ 2 │ 2 │ 0 │ 1
│ 3 │ 3 │ 1 │ 0 │
│ 4 │ 4 │ 0 │ 0 │
Row │ id x y
│ Int64 Bool Bool
─────┼─────────────────────
1 │ 1 true true
2 │ 2 false true
3 │ 3 true false
4 │ 4 false false

julia> subset!(copy(df), :x, :y => ByRow(!));
julia> subset!(df, :x, :y => ByRow(!));

julia> df
4×3 DataFrame
│ Row │ id │ x │ y │
│ │ Int64 │ Bool │ Bool │
├─────┼───────┼──────┼──────┤
│ 1 │ 1 │ 1 │ 1 │
│ 2 │ 2 │ 0 │ 1 │
│ 3 │ 3 │ 1 │ 0 │
│ 4 │ 4 │ 0 │ 0 │
1×3 DataFrame
Row │ id x y
│ Int64 Bool Bool
─────┼────────────────────
1 │ 3 true false

julia> df = DataFrame(id=1:4, y=[true, true, false, false], v=[1, 2, 11, 12]);

julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x));

julia> df
2×3 DataFrame
Row │ id y │ v │
│ Int64 Bool │ Int64 │
─────┼────────────────────
│ 1 2 │ 1 │ 2 │
│ 2 4 │ 0 12
Row │ id y v
│ Int64 Bool Int64
─────┼────────────────────
1 │ 2 true 2
2 │ 4 false 12

julia> df = DataFrame(id=1:4, x=[true, false, true, false],
z=[true, true, missing, missing], v=1:4);
z=[true, true, missing, missing], v=1:4)
4×4 DataFrame
Row │ id x z v
│ Int64 Bool Bool? Int64
─────┼──────────────────────────────
1 │ 1 true true 1
2 │ 2 false true 2
3 │ 3 true missing 3
4 │ 4 false missing 4

julia> subset!(df, :x, :z)
ERROR: ArgumentError: each transformation must return a vector whose eltype is subtype of Bool
ERROR: ArgumentError: skipmissing=false and in row 3 of the passed conditions were evaluated to missing value

julia> subset!(df, :x, :z, skipmissing=true);

julia> df
1×4 DataFrame
Row │ id x z │ v │
│ Int64 Bool Bool? Int64
─────┼───────────────────────────
│ 1 │ 1 │ 1 1 │ 1
Row │ id x z v
│ Int64 Bool Bool? Int64
─────┼───────────────────────────
1 │ 1 true true 1

julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false],
z=[true, true, missing, missing], v=[1, 2, 11, 12]);
@@ -211,11 +244,11 @@ julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x));

julia> df
2×5 DataFrame
Row │ id x y z │ v │
│ Int64 Bool Bool Bool? Int64
─────┼───────────────────────────────────
│ 1 2 │ 0 │ 1 │ 1 │ 2 │
│ 2 4 │ 0 │ 0 │ missing │ 12
Row │ id x y z v
│ Int64 Bool Bool Bool? Int64
─────┼─────────────────────────────────────
1 │ 2 false true true 2
2 │ 4 false false missing 12
```
"""
function subset!(df::AbstractDataFrame, @nospecialize(args...); skipmissing::Bool=false)
39 changes: 39 additions & 0 deletions test/grouping.jl
Original file line number Diff line number Diff line change
@@ -3300,6 +3300,9 @@ end
filter([:x, :id] => (x, id) -> x && id < 4, df)
@test_throws ArgumentError subset(df)
@test isempty(subset(df, :x, :x => ByRow(!)))
@test isempty(subset(df, :x => x -> false, :x => x -> missing))
@test_throws ArgumentError subset(df, :x => x -> true, :x => x -> missing)
@test_throws ArgumentError subset(df, :x => x -> true, :x => x -> 2)
end

for df in (copy(refdf), @view copy(refdf)[1:end-1, :]),
@@ -3325,16 +3328,22 @@ end
filter([:x, :id] => (x, id) -> x && id < 4, df)
@test_throws ArgumentError subset(gdf)
@test isempty(subset(gdf, :x, :x => ByRow(!)))
@test isempty(subset(gdf, :x => x -> false, :x => x -> missing))
@test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> missing)
@test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> 2)
end

df = copy(refdf)
@test subset!(df, :x) === df
@test subset!(df, :x) ≅ df ≅ filter(:x => identity, refdf)
df = copy(refdf)
@test_throws ArgumentError subset!(df, :y)
@test df ≅ refdf
df = copy(refdf)
@test subset!(df, :y, skipmissing=true) === df
@test subset!(df, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
df = copy(refdf)
@test subset!(df, :x, :y, skipmissing=true) === df
@test subset!(df, :x, :y, skipmissing=true) ≅ df ≅
filter([:x, :y] => (x, y) -> x && y === true, refdf)
df = copy(refdf)
@@ -3349,6 +3358,15 @@ end
@test isempty(subset!(df, :x, :x => ByRow(!)))
@test isempty(df)

df = copy(refdf)
@test isempty(subset!(df, :x => x -> false, :x => x -> missing))
df = copy(refdf)
@test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> missing)
@test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> 2)

df = copy(refdf)
gdf = groupby(df, :z)
@test subset!(gdf, :x) === df
df = copy(refdf)
gdf = groupby(df, :z)
@test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf)
@@ -3358,9 +3376,15 @@ end
@test df ≅ refdf
df = copy(refdf)
gdf = groupby(df, :z)
@test subset!(gdf, :y, skipmissing=true) === df
df = copy(refdf)
gdf = groupby(df, :z)
@test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
df = copy(refdf)
gdf = groupby(df, :z)
@test subset!(gdf, :x, :y, skipmissing=true) === df
df = copy(refdf)
gdf = groupby(df, :z)
@test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅
filter([:x, :y] => (x, y) -> x && y === true, refdf)
df = copy(refdf)
@@ -3378,6 +3402,13 @@ end
gdf = groupby(df, :z)
@test isempty(subset!(gdf, :x, :x => ByRow(!)))
@test isempty(df)
df = copy(refdf)
gdf = groupby(df, :z)
@test isempty(subset!(gdf, :x => x -> false, :x => x -> missing))
df = copy(refdf)
gdf = groupby(df, :z)
@test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing)
@test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2)

df = copy(refdf)
gdf = groupby(df, :z)[[3, 2, 1]]
@@ -3409,6 +3440,14 @@ end
@test isempty(subset!(gdf, :x, :x => ByRow(!)))
@test isempty(df)

df = copy(refdf)
gdf = groupby(df, :z)[[3, 2, 1]]
@test isempty(subset!(gdf, :x => x -> false, :x => x -> missing))
df = copy(refdf)
gdf = groupby(df, :z)[[3, 2, 1]]
@test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing)
@test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2)

@test_throws ArgumentError subset!(view(refdf, :, :), :x)
@test_throws ArgumentError subset!(groupby_checked(view(refdf, :, :), :z), :x)