Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add subset #2496

Merged
merged 17 commits into from
Jan 11, 2021
36 changes: 23 additions & 13 deletions src/abstractdataframe/subset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ end

# Note that _get_subset_conditions will have a large compilation time
# if more than 32 conditions are passed as `args`.
@noinline function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
@nospecialize(args), skipmissing::Bool)
conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)]

Expand All @@ -69,7 +69,8 @@ end

"""
subset(df::AbstractDataFrame, args...; skipmissing::Bool=false, view::Bool=false)
subset(gdf::GroupedDataFrame, args...; skipmissing::Bool=false, view::Bool=false)
subset(gdf::GroupedDataFrame, args...; skipmissing::Bool=false, view::Bool=false,
ungroup::Bool=true)

Return a copy of data frame `df` or parent of `gdf` containing only rows for
which all values produced by transformation(s) `args` for a given row are `true`.
Expand All @@ -79,8 +80,7 @@ Each argument passed in `args` can be either a single column selector or a
described for [`select`](@ref).

Note that as opposed to [`filter`](@ref) the `subset` function works on whole
columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows
for which condition is false.
columns (or all rows in groups for `GroupedDataFrame`).

If `skipmissing=false` (the default) `args` are required to produce vectors
containing only `Bool` values. If `skipmissing=true`, additionally `missing` is
Expand All @@ -89,10 +89,13 @@ returns `missing` are skipped).

If `view=true` a `SubDataFrame` view is returned instead of a `DataFrame`.

If `ungroup=false` the return value of the operation on `gdf` is re-grouped
based on the same grouping columns and a `GroupedDataFrame` is returned.
bkamins marked this conversation as resolved.
Show resolved Hide resolved

If a `GroupedDataFrame` is passed then it must include all groups present in the
`parent` data frame, like in [`select!`](@ref).

See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref), [`select`](@ref)
See also: [`subset!`](@ref), [`filter`](@ref), [`select`](@ref)

# Examples

Expand Down Expand Up @@ -157,15 +160,18 @@ julia> subset(groupby(df, :y), :v => x -> x .> minimum(x))
end

@inline function subset(gdf::GroupedDataFrame, @nospecialize(args...);
skipmissing::Bool=false, view::Bool=false)
skipmissing::Bool=false, view::Bool=false,
ungroup::Bool=true)
row_selector = _get_subset_conditions(gdf, args, skipmissing)
df = parent(gdf)
return view ? Base.view(df, row_selector, :) : df[row_selector, :]
res = view ? Base.view(df, row_selector, :) : df[row_selector, :]
return ungroup ? res : groupby(res, groupcols(gdf))
end

"""
subset!(df::AbstractDataFrame, args...; skipmissing::Bool=false)
subset!(gdf::GroupedDataFrame{DataFrame}, args...; skipmissing::Bool=false)
subset!(gdf::GroupedDataFrame{DataFrame}, args...; skipmissing::Bool=false,
ungroup::Bool=true)

Update data frame `df` or the parent of `gdf` in place to contain only rows for
which all values produced by transformation(s) `args` for a given row are `true`.
Expand All @@ -175,18 +181,20 @@ Each argument passed in `args` can be either a single column selector or a
described for [`select`](@ref).

Note that as opposed to [`filter!`](@ref) the `subset!` function works on whole
columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows
for which condition is false.
columns (or all rows in groups for `GroupedDataFrame`).

If `skipmissing=false` (the default) `args` are required to produce vectors
containing only `Bool` values. If `skipmissing=true`, additionally `missing` is
allowed and it is treated as `false` (i.e. rows for which one of the conditions
returns `missing` are skipped).

If `ungroup=false` the return value of the operation on `gdf` is re-grouped
based on the same grouping columns and a `GroupedDataFrame` is returned.
bkamins marked this conversation as resolved.
Show resolved Hide resolved

If a `GroupedDataFrame` is subsetted then it must include all groups present in the
`parent` data frame, like in [`select!`](@ref).

See also: [`subset`](@ref), [`filter`](@ref), [`filter!`](@ref)
See also: [`subset`](@ref), [`filter!`](@ref), [`select!`](@ref)

# Examples

Expand Down Expand Up @@ -264,8 +272,10 @@ function subset!(df::AbstractDataFrame, @nospecialize(args...); skipmissing::Boo
return delete!(df, findall(!, row_selector))
bkamins marked this conversation as resolved.
Show resolved Hide resolved
end

function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Bool=false)
function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Bool=false,
ungroup::Bool=true)
row_selector = _get_subset_conditions(gdf, args, skipmissing)
df = parent(gdf)
return delete!(df, findall(!, row_selector))
res = delete!(df, findall(!, row_selector))
return ungroup ? res : groupby(res, groupcols(gdf))
end
60 changes: 39 additions & 21 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3279,7 +3279,9 @@ end
id = 1:8)

for df in (copy(refdf), @view copy(refdf)[1:end-1, :])
df2 = copy(df)
@test subset(df, :x) ≅ filter(:x => identity, df)
@test df ≅ df2
@test subset(df, :x) isa DataFrame
@test subset(df, :x, view=true) ≅ filter(:x => identity, df)
@test subset(df, :x, view=true) isa SubDataFrame
Expand Down Expand Up @@ -3312,10 +3314,18 @@ end

for df in (copy(refdf), @view copy(refdf)[1:end-1, :]),
gdf in (groupby_checked(df, :z), groupby_checked(df, :z)[[3, 2, 1]])
df2 = copy(df)
@test subset(gdf, :x) ≅ filter(:x => identity, df)
@test df ≅ df2
@test subset(gdf, :x) isa DataFrame
@test subset(gdf, :x, ungroup=false) ≅
groupby_checked(filter(:x => identity, df), :z)
@test subset(gdf, :x, ungroup=false) isa GroupedDataFrame{DataFrame}
@test subset(gdf, :x, view=true) ≅ filter(:x => identity, df)
@test subset(gdf, :x, view=true) isa SubDataFrame
@test subset(gdf, :x, view=true, ungroup=false) ≅
groupby_checked(filter(:x => identity, df), :z)
@test subset(gdf, :x, view=true, ungroup=false) isa GroupedDataFrame{<:SubDataFrame}
@test_throws ArgumentError subset(gdf, :y)
@test_throws ArgumentError subset(gdf, :y, :x)
@test subset(gdf, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df)
Expand Down Expand Up @@ -3374,81 +3384,89 @@ end
@test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> 2)

df = copy(refdf)
gdf = groupby(df, :z)
gdf = groupby_checked(df, :z)
@test subset!(gdf, :x) === df

df = copy(refdf)
gdf = groupby_checked(df, :z)
gdf2 = subset!(gdf, :x, ungroup=false)
@test gdf2 isa GroupedDataFrame{DataFrame}
@test parent(gdf2) === df
@test gdf2 ≅ groupby_checked(df, :z) ≅ groupby_checked(filter(:x => identity, refdf), :z)

df = copy(refdf)
gdf = groupby(df, :z)
gdf = groupby_checked(df, :z)
@test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf)
df = copy(refdf)
gdf = groupby(df, :z)
gdf = groupby_checked(df, :z)
@test_throws ArgumentError subset!(gdf, :y)
@test df ≅ refdf
df = copy(refdf)
gdf = groupby(df, :z)
gdf = groupby_checked(df, :z)
@test subset!(gdf, :y, skipmissing=true) === df
df = copy(refdf)
gdf = groupby(df, :z)
gdf = groupby_checked(df, :z)
@test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
df = copy(refdf)
gdf = groupby(df, :z)
gdf = groupby_checked(df, :z)
@test subset!(gdf, :x, :y, skipmissing=true) === df
df = copy(refdf)
gdf = groupby(df, :z)
gdf = groupby_checked(df, :z)
@test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅
filter([:x, :y] => (x, y) -> x && y === true, refdf)
df = copy(refdf)
gdf = groupby(df, :z)
gdf = groupby_checked(df, :z)
@test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅
filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf)
df = copy(refdf)
gdf = groupby(df, :z)
gdf = groupby_checked(df, :z)
@test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅
filter([:x, :id] => (x, id) -> x && id < 4, refdf)
df = copy(refdf)
gdf = groupby(df, :z)
gdf = groupby_checked(df, :z)
@test_throws ArgumentError subset!(gdf)
df = copy(refdf)
gdf = groupby(df, :z)
gdf = groupby_checked(df, :z)
@test isempty(subset!(gdf, :x, :x => ByRow(!)))
@test isempty(df)
df = copy(refdf)
gdf = groupby(df, :z)
gdf = groupby_checked(df, :z)
@test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing)
@test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing)
@test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2)

df = copy(refdf)
gdf = groupby(df, :z)[[3, 2, 1]]
gdf = groupby_checked(df, :z)[[3, 2, 1]]
@test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf)
df = copy(refdf)
gdf = groupby(df, :z)[[3, 2, 1]]
gdf = groupby_checked(df, :z)[[3, 2, 1]]
@test_throws ArgumentError subset!(gdf, :y)
@test df ≅ refdf
df = copy(refdf)
gdf = groupby(df, :z)[[3, 2, 1]]
gdf = groupby_checked(df, :z)[[3, 2, 1]]
@test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
df = copy(refdf)
gdf = groupby(df, :z)[[3, 2, 1]]
gdf = groupby_checked(df, :z)[[3, 2, 1]]
@test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅
filter([:x, :y] => (x, y) -> x && y === true, refdf)
df = copy(refdf)
gdf = groupby(df, :z)[[3, 2, 1]]
gdf = groupby_checked(df, :z)[[3, 2, 1]]
@test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅
filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf)
df = copy(refdf)
gdf = groupby(df, :z)[[3, 2, 1]]
gdf = groupby_checked(df, :z)[[3, 2, 1]]
@test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅
filter([:x, :id] => (x, id) -> x && id < 4, refdf)
df = copy(refdf)
gdf = groupby(df, :z)[[3, 2, 1]]
gdf = groupby_checked(df, :z)[[3, 2, 1]]
@test_throws ArgumentError subset!(gdf)
df = copy(refdf)
gdf = groupby(df, :z)[[3, 2, 1]]
gdf = groupby_checked(df, :z)[[3, 2, 1]]
@test isempty(subset!(gdf, :x, :x => ByRow(!)))
@test isempty(df)

df = copy(refdf)
gdf = groupby(df, :z)[[3, 2, 1]]
gdf = groupby_checked(df, :z)[[3, 2, 1]]
@test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing)
@test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing)
@test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2)
Expand Down