From 4342310af12ed12a3c1a48bf6128e546ef41ec31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 14 Dec 2020 13:44:44 +0100 Subject: [PATCH 01/16] add subset and subset! --- NEWS.md | 2 + docs/src/lib/functions.md | 2 + docs/src/man/comparisons.md | 4 +- src/DataFrames.jl | 3 + src/abstractdataframe/subset.jl | 231 ++++++++++++++++++++++++++++++++ test/grouping.jl | 162 ++++++++++++++++++++++ 6 files changed, 402 insertions(+), 2 deletions(-) create mode 100644 src/abstractdataframe/subset.jl diff --git a/NEWS.md b/NEWS.md index e9e9295f88..449f3e244f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ ## New functionalities +* add `subset` and `subset!` functions that allow to subset rows + ([#2496](https://github.com/JuliaData/DataFrames.jl/pull/2496)) ## Deprecated diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 128f0ac9e7..d6d327c94c 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -102,6 +102,8 @@ first last only nonunique +subset +subset! unique unique! 
``` diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md index 64ba636af1..f8bbf6f470 100644 --- a/docs/src/man/comparisons.md +++ b/docs/src/man/comparisons.md @@ -204,7 +204,7 @@ df <- tibble(grp = rep(1:2, 3), x = 6:1, y = 4:9, | Rename columns | `rename(df, x_new = x)` | `rename(df, :x => :x_new)` | | Pick columns | `select(df, x, y)` | `select(df, :x, :y)` | | Pick & transform columns | `transmute(df, mean(x), y)` | `select(df, :x => mean, :y)` | -| Pick rows | `filter(df, x >= 1)` | `filter(:x => >=(1), df)` | +| Pick rows | `filter(df, x >= 1)` | `subset(df, :x => ByRow(>=(1)))` | | Sort rows | `arrange(df, x)` | `sort(df, :x)` | As in dplyr, some of these functions can be applied to grouped data frames, in which case they operate by group: @@ -240,7 +240,7 @@ The following table compares the main functions of DataFrames.jl with Stata: | Add new columns | `egen x_mean = mean(x)` | `transform!(df, :x => mean => :x_mean)` | | Rename columns | `rename x x_new` | `rename!(df, :x => :x_new)` | | Pick columns | `keep x y` | `select!(df, :x, :y)` | -| Pick rows | `keep if x >= 1` | `filter!(:x => >=(1), df)` | +| Pick rows | `keep if x >= 1` | `subset!(df, :x => ByRow(>=(1)))` | | Sort rows | `sort x` | `sort!(df, :x)` | Note that the suffix `!` (i.e. 
`transform!`, `select!`, etc) ensures that the operation transforms the dataframe in place, as in Stata diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 0715c40a3e..7417259461 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -64,6 +64,8 @@ export AbstractDataFrame, select, semijoin, stack, + subset, + subset!, transform, transform!, unique!, @@ -104,6 +106,7 @@ include("dataframerow/utils.jl") include("other/broadcasting.jl") include("abstractdataframe/selection.jl") +include("abstractdataframe/subset.jl") include("abstractdataframe/iteration.jl") include("abstractdataframe/join.jl") include("abstractdataframe/reshape.jl") diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl new file mode 100644 index 0000000000..7114cd19a9 --- /dev/null +++ b/src/abstractdataframe/subset.jl @@ -0,0 +1,231 @@ +# subset allows a transformation specification without a target column name or a column + +_process_subset_pair(i::Int, a::ColumnIndex) = a => Symbol(:x, i) +_process_subset_pair(i::Int, @nospecialize(a::Pair{<:Any, <:Base.Callable})) = + first(a) => last(a) => Symbol(:x, i) +_process_subset_pair(i::Int, a) = + throw(ArgumentError("condition specifier $a is not supported by `subset`")) + +function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame}, + @nospecialize(args), skipmissing::Bool) + conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)] + + isempty(conditions) && throw(ArgumentError("at least one condition must be passed")) + + if df isa AbstractDataFrame + df_conditions = select(df, conditions..., copycols=!(df isa DataFrame)) + else + df_conditions = select(df, conditions..., + copycols=!(parent(df) isa DataFrame), keepkeys=false) + end + test_type = skipmissing ? 
Union{Missing, Bool} : Bool + for col in eachcol(df_conditions) + if !(eltype(col) <: test_type) + throw(ArgumentError("each transformation must produce a vector whose " * + "eltype is subtype of $test_type")) + end + end + @assert ncol(df_conditions) == length(conditions) + if ncol(df_conditions) == 1 + return .===(df_conditions[!, 1], true) + else + return .===(.&(eachcol(df_conditions)...), true) + end +end + +""" + subset(df::AbstractDataFrame, args...; skipmissing::Bool=false, view::Bool=false) + subset(gdf::GroupedDataFrame, args...; skipmissing::Bool=false, view::Bool=false) + +Return a copy of data frame `df` or parent of `gdf` containing only rows for +which all values produced by transformation(s) `args` for a given row is `true`. + +If `skipmissing=true` returing `missing` in `args` is allowed and these rows are +dropped. If `skipmissing=false` (the default) an error is thrown if `missing` is +present. + +Each argument passed in `args` can be either a single column selector or a +`source_columns => function` transformation specifier following the rules +described for [`select`](@ref). + +Note that as opposed to [`filter`](@ref) the `subset` function works on whole +columns (or all rows in groups for `GroupedDataFrame`) and by default drops rows +for which contition is false. + +If `view=true` a `SubDataFrame` view is returned instead of a `DataFrame`. + +If `GroupedDataFrame` is subsetted then it must include all groups present in the +`parent` data frame, like in [`select!`](@ref). + +See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref) + +# Examples + +``` +julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false], + z=[true, true, missing, missing], v=[1, 2, 11, 12]) +4×5 DataFrame +│ Row │ id │ x │ y │ z │ v │ +│ │ Int64 │ Bool │ Bool │ Bool? 
│ Int64 │ +├─────┼───────┼──────┼──────┼─────────┼───────┤ +│ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 0 │ 1 │ 1 │ 2 │ +│ 3 │ 3 │ 1 │ 0 │ missing │ 11 │ +│ 4 │ 4 │ 0 │ 0 │ missing │ 12 │ + +julia> subset(df, :x) +2×5 DataFrame +│ Row │ id │ x │ y │ z │ v │ +│ │ Int64 │ Bool │ Bool │ Bool? │ Int64 │ +├─────┼───────┼──────┼──────┼─────────┼───────┤ +│ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ +│ 2 │ 3 │ 1 │ 0 │ missing │ 11 │ + +julia> subset(df, :v => x -> x .> 3) +2×5 DataFrame +│ Row │ id │ x │ y │ z │ v │ +│ │ Int64 │ Bool │ Bool │ Bool? │ Int64 │ +├─────┼───────┼──────┼──────┼─────────┼───────┤ +│ 1 │ 3 │ 1 │ 0 │ missing │ 11 │ +│ 2 │ 4 │ 0 │ 0 │ missing │ 12 │ + +julia> subset(df, :x, :y => ByRow(!)) +1×5 DataFrame +│ Row │ id │ x │ y │ z │ v │ +│ │ Int64 │ Bool │ Bool │ Bool? │ Int64 │ +├─────┼───────┼──────┼──────┼─────────┼───────┤ +│ 1 │ 3 │ 1 │ 0 │ missing │ 11 │ + +julia> subset(df, :x, :z, skipmissing=true) +1×5 DataFrame +│ Row │ id │ x │ y │ z │ v │ +│ │ Int64 │ Bool │ Bool │ Bool? │ Int64 │ +├─────┼───────┼──────┼──────┼───────┼───────┤ +│ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ + +julia> subset(df, :x, :z) +ERROR: ArgumentError: each transformation must return a vector whose eltype is subtype of Bool + +julia> subset(groupby(df, :y), :v => x -> x .> minimum(x)) +2×5 DataFrame +│ Row │ id │ x │ y │ z │ v │ +│ │ Int64 │ Bool │ Bool │ Bool? │ Int64 │ +├─────┼───────┼──────┼──────┼─────────┼───────┤ +│ 1 │ 2 │ 0 │ 1 │ 1 │ 2 │ +│ 2 │ 4 │ 0 │ 0 │ missing │ 12 │ +``` +""" +@inline function subset(df::AbstractDataFrame, @nospecialize(args...); + skipmissing::Bool=false, view::Bool=false) + row_selector = _get_subset_conditions(df, args, skipmissing) + return view ? Base.view(df, row_selector, :) : df[row_selector, :] +end + +@inline function subset(gdf::GroupedDataFrame, @nospecialize(args...); + skipmissing::Bool=false, view::Bool=false) + row_selector = _get_subset_conditions(gdf, args, skipmissing) + df = parent(gdf) + return view ? 
Base.view(df, row_selector, :) : df[row_selector, :] +end + +""" + subset!(df::AbstractDataFrame, args...; skipmissing::Bool=false) + subset!(gdf::GroupedDataFrame{DataFrame}, args..., skipmissing::Bool=false) + +Update data frame `df` or the parent of `gdf` in place to contain only rows for +which all values produced by transformation(s) `args` for a given row is `true`. + +If `skipmissing=true` returing `missing` in `args` is allowed and these rows are +dropped. If `skipmissing=false` (the default) an error is thrown if `missing` is +present. + +Each argument passed in `args` can be either a single column selector or a +`source_columns => function` transformation specifier following the rules +described for [`select`](@ref). + +Note that as opposed to [`filter!`](@ref) the `subset!` function works on whole +columns (or all rows in groups for `GroupedDataFrame`) and by default drops rows +for which contition is false. + +If `GroupedDataFrame` is subsetted then it must include all groups present in the +`parent` data frame, like in [`select!`](@ref). 
+ +See also: [`subset`](@ref), [`filter`](@ref), [`filter!`](@ref) + +# Examples + +``` +julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false]) +4×3 DataFrame +│ Row │ id │ x │ y │ +│ │ Int64 │ Bool │ Bool │ +├─────┼───────┼──────┼──────┤ +│ 1 │ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 0 │ 1 │ +│ 3 │ 3 │ 1 │ 0 │ +│ 4 │ 4 │ 0 │ 0 │ + +julia> subset!(copy(df), :x, :y => ByRow(!)); + +julia> df +4×3 DataFrame +│ Row │ id │ x │ y │ +│ │ Int64 │ Bool │ Bool │ +├─────┼───────┼──────┼──────┤ +│ 1 │ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 0 │ 1 │ +│ 3 │ 3 │ 1 │ 0 │ +│ 4 │ 4 │ 0 │ 0 │ + +julia> df = DataFrame(id=1:4, y=[true, true, false, false], v=[1, 2, 11, 12]); + +julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x)); + +julia> df +2×3 DataFrame +│ Row │ id │ y │ v │ +│ │ Int64 │ Bool │ Int64 │ +├─────┼───────┼──────┼───────┤ +│ 1 │ 2 │ 1 │ 2 │ +│ 2 │ 4 │ 0 │ 12 │ + +julia> df = DataFrame(id=1:4, x=[true, false, true, false], + z=[true, true, missing, missing], v=1:4); + +julia> subset!(df, :x, :z) +ERROR: ArgumentError: each transformation must return a vector whose eltype is subtype of Bool + +julia> subset!(df, :x, :z, skipmissing=true); + +julia> df +1×4 DataFrame +│ Row │ id │ x │ z │ v │ +│ │ Int64 │ Bool │ Bool? │ Int64 │ +├─────┼───────┼──────┼───────┼───────┤ +│ 1 │ 1 │ 1 │ 1 │ 1 │ + +julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false], + z=[true, true, missing, missing], v=[1, 2, 11, 12]); + +julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x)); + +julia> df +2×5 DataFrame +│ Row │ id │ x │ y │ z │ v │ +│ │ Int64 │ Bool │ Bool │ Bool? 
│ Int64 │ +├─────┼───────┼──────┼──────┼─────────┼───────┤ +│ 1 │ 2 │ 0 │ 1 │ 1 │ 2 │ +│ 2 │ 4 │ 0 │ 0 │ missing │ 12 │ +``` +""" +function subset!(df::AbstractDataFrame, @nospecialize(args...); skipmissing::Bool=false) + row_selector = _get_subset_conditions(df, args, skipmissing) + return delete!(df, findall(!, row_selector)) +end + +function subset!(gdf::GroupedDataFrame, @nospecialize(args...); + skipmissing::Bool=false) + row_selector = _get_subset_conditions(gdf, args, skipmissing) + df = parent(gdf) + return delete!(df, findall(!, row_selector)) +end diff --git a/test/grouping.jl b/test/grouping.jl index a2df4868ae..0e741ff351 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -3272,4 +3272,166 @@ end @test df == df2 end +@testset "subset and subset!" begin + refdf = DataFrame(x = repeat([true, false], 4), + y = repeat([true, false, missing, missing], 2), + z = repeat([1, 2, 3, 3], 2), + id = 1:8) + + for df in (copy(refdf), @view copy(refdf)[1:end-1, :]) + @test subset(df, :x) ≅ filter(:x => identity, df) + @test subset(df, :x) isa DataFrame + @test subset(df, :x, view=true) ≅ filter(:x => identity, df) + @test subset(df, :x, view=true) isa SubDataFrame + @test_throws ArgumentError subset(df, :y) + @test subset(df, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) + @test subset(df, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) + @test subset(df, :x, :y, skipmissing=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(df, :x, :y, skipmissing=true, view=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) + @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true, view=true) ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) + @test subset(df, :x, :id => ByRow(<(4))) ≅ + filter([:x, :id] => (x, id) -> x && id < 4, df) + @test 
subset(df, :x, :id => ByRow(<(4)), view=true) ≅ + filter([:x, :id] => (x, id) -> x && id < 4, df) + @test_throws ArgumentError subset(df) + @test isempty(subset(df, :x, :x => ByRow(!))) + end + + for df in (copy(refdf), @view copy(refdf)[1:end-1, :]), + gdf in (groupby_checked(df, :z), groupby_checked(df, :z)[[3, 2, 1]]) + @test subset(gdf, :x) ≅ filter(:x => identity, df) + @test subset(gdf, :x) isa DataFrame + @test subset(gdf, :x, view=true) ≅ filter(:x => identity, df) + @test subset(gdf, :x, view=true) isa SubDataFrame + @test_throws ArgumentError subset(gdf, :y) + @test subset(gdf, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) + @test subset(gdf, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) + @test subset(gdf, :x, :y, skipmissing=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(gdf, :x, :y, skipmissing=true, view=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) + @test subset(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true, view=true) ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) + @test subset(gdf, :x, :id => ByRow(<(4))) ≅ + filter([:x, :id] => (x, id) -> x && id < 4, df) + @test subset(gdf, :x, :id => ByRow(<(4)), view=true) ≅ + filter([:x, :id] => (x, id) -> x && id < 4, df) + @test_throws ArgumentError subset(gdf) + @test isempty(subset(gdf, :x, :x => ByRow(!))) + end + + df = copy(refdf) + @test subset!(df, :x) ≅ df ≅ filter(:x => identity, refdf) + df = copy(refdf) + @test_throws ArgumentError subset!(df, :y) + @test df ≅ refdf + df = copy(refdf) + @test subset!(df, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) + df = copy(refdf) + @test subset!(df, :x, :y, skipmissing=true) ≅ df ≅ + filter([:x, :y] => (x, y) -> x && y === true, refdf) + df = copy(refdf) + @test subset!(df, :x, :y, :id => 
ByRow(<(4)), skipmissing=true) ≅ df ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf) + df = copy(refdf) + @test subset!(df, :x, :id => ByRow(<(4))) ≅ df ≅ + filter([:x, :id] => (x, id) -> x && id < 4, refdf) + df = copy(refdf) + @test_throws ArgumentError subset!(df) + df = copy(refdf) + @test isempty(subset!(df, :x, :x => ByRow(!))) + @test isempty(df) + + df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf) + df = copy(refdf) + gdf = groupby(df, :z) + @test_throws ArgumentError subset!(gdf, :y) + @test df ≅ refdf + df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) + df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅ + filter([:x, :y] => (x, y) -> x && y === true, refdf) + df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf) + df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅ + filter([:x, :id] => (x, id) -> x && id < 4, refdf) + df = copy(refdf) + gdf = groupby(df, :z) + @test_throws ArgumentError subset!(gdf) + df = copy(refdf) + gdf = groupby(df, :z) + @test isempty(subset!(gdf, :x, :x => ByRow(!))) + @test isempty(df) + + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf) + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test_throws ArgumentError subset!(gdf, :y) + @test df ≅ refdf + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅ + filter([:x, :y] => (x, y) -> x && y === true, refdf) + df = copy(refdf) + gdf = groupby(df, 
:z)[[3, 2, 1]] + @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf) + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅ + filter([:x, :id] => (x, id) -> x && id < 4, refdf) + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test_throws ArgumentError subset!(gdf) + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test isempty(subset!(gdf, :x, :x => ByRow(!))) + @test isempty(df) + + @test_throws ArgumentError subset!(view(refdf, :, :), :x) + @test_throws ArgumentError subset!(groupby_checked(view(refdf, :, :), :z), :x) + + df = DataFrame(g=[2, 2, 1, 1, 1, 1, 3, 3, 3], x = 1:9) + @test subset(df, :x => x -> x .< mean(x)) == DataFrame(g=[2, 2, 1, 1], x = 1:4) + @test subset(groupby_checked(df, :g), :x => x -> x .< mean(x)) == + DataFrame(g=[2, 1, 1, 3], x=[1, 3, 4, 7]) + + @test_throws ArgumentError subset(df, :x => x -> missing) + @test isempty(subset(df, :x => x -> missing, skipmissing=true)) + @test isempty(subset(df, :x => x -> false)) + @test subset(df, :x => x -> true) ≅ df + @test_throws ArgumentError subset(df, :x => x -> (a=x,)) + @test_throws ArgumentError subset(df, :x => x -> (a=x,) => AsTable) +end + +@testset "make sure we handle idx correctly when groups are reordered" begin + df = DataFrame(g=[2, 2, 1, 1, 1], id = 1:5) + @test select(df, :g, :id, :id => ByRow(identity) => :id2) == + select(groupby_checked(df, :g), :id, :id => ByRow(identity) => :id2) == + select(groupby_checked(df, :g, sort=true), :id, :id => ByRow(identity) => :id2) == + select(groupby_checked(df, :g)[[2,1]], :id, :id => ByRow(identity) => :id2) == + [df DataFrame(id2=df.id)] +end + end # module From be7c506410dc0ccad65685945d50b0c8ff0ad6a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 14 Dec 2020 15:52:10 +0100 Subject: [PATCH 02/16] fix typo in test --- test/grouping.jl | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/test/grouping.jl b/test/grouping.jl index 0e741ff351..4ae2ecb553 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -3422,7 +3422,7 @@ end @test isempty(subset(df, :x => x -> false)) @test subset(df, :x => x -> true) ≅ df @test_throws ArgumentError subset(df, :x => x -> (a=x,)) - @test_throws ArgumentError subset(df, :x => x -> (a=x,) => AsTable) + @test_throws ArgumentError subset(df, :x => (x -> (a=x,)) => AsTable) end @testset "make sure we handle idx correctly when groups are reordered" begin From 6c4cf8c0cb12e4531690bed357482ed801847c34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 20 Dec 2020 16:50:39 +0100 Subject: [PATCH 03/16] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/subset.jl | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index 7114cd19a9..aad3fa6fd8 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -7,7 +7,7 @@ _process_subset_pair(i::Int, a) = throw(ArgumentError("condition specifier $a is not supported by `subset`")) function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame}, - @nospecialize(args), skipmissing::Bool) + @nospecialize(args), skipmissing::Bool) conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)] isempty(conditions) && throw(ArgumentError("at least one condition must be passed")) @@ -38,9 +38,9 @@ end subset(gdf::GroupedDataFrame, args...; skipmissing::Bool=false, view::Bool=false) Return a copy of data frame `df` or parent of `gdf` containing only rows for -which all values produced by transformation(s) `args` for a given row is `true`. +which all values produced by transformation(s) `args` for a given row are `true`. 
-If `skipmissing=true` returing `missing` in `args` is allowed and these rows are +If `skipmissing=true`, returning `missing` in `args` is allowed and corresponding rows are dropped. If `skipmissing=false` (the default) an error is thrown if `missing` is present. @@ -50,14 +50,14 @@ described for [`select`](@ref). Note that as opposed to [`filter`](@ref) the `subset` function works on whole columns (or all rows in groups for `GroupedDataFrame`) and by default drops rows -for which contition is false. +for which condition is false. If `view=true` a `SubDataFrame` view is returned instead of a `DataFrame`. -If `GroupedDataFrame` is subsetted then it must include all groups present in the +If a `GroupedDataFrame` is passed then it must include all groups present in the `parent` data frame, like in [`select!`](@ref). -See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref) +See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref), [`select`](@ref) # Examples @@ -190,7 +190,7 @@ julia> df │ 2 │ 4 │ 0 │ 12 │ julia> df = DataFrame(id=1:4, x=[true, false, true, false], - z=[true, true, missing, missing], v=1:4); + z=[true, true, missing, missing], v=1:4); julia> subset!(df, :x, :z) ERROR: ArgumentError: each transformation must return a vector whose eltype is subtype of Bool @@ -205,7 +205,7 @@ julia> df │ 1 │ 1 │ 1 │ 1 │ 1 │ julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false], - z=[true, true, missing, missing], v=[1, 2, 11, 12]); + z=[true, true, missing, missing], v=[1, 2, 11, 12]); julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x)); @@ -223,8 +223,7 @@ function subset!(df::AbstractDataFrame, @nospecialize(args...); skipmissing::Boo return delete!(df, findall(!, row_selector)) end -function subset!(gdf::GroupedDataFrame, @nospecialize(args...); - skipmissing::Bool=false) +function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Bool=false) row_selector = _get_subset_conditions(gdf, args, 
skipmissing) df = parent(gdf) return delete!(df, findall(!, row_selector)) From d614e5594c4e0061bda4825b2af2840d53b313d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 20 Dec 2020 17:44:20 +0100 Subject: [PATCH 04/16] changes after code review --- docs/src/man/comparisons.md | 7 +- src/abstractdataframe/subset.jl | 179 +++++++++++++++++++------------- test/grouping.jl | 39 +++++++ 3 files changed, 151 insertions(+), 74 deletions(-) diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md index f8bbf6f470..28c819876d 100644 --- a/docs/src/man/comparisons.md +++ b/docs/src/man/comparisons.md @@ -204,9 +204,14 @@ df <- tibble(grp = rep(1:2, 3), x = 6:1, y = 4:9, | Rename columns | `rename(df, x_new = x)` | `rename(df, :x => :x_new)` | | Pick columns | `select(df, x, y)` | `select(df, :x, :y)` | | Pick & transform columns | `transmute(df, mean(x), y)` | `select(df, :x => mean, :y)` | -| Pick rows | `filter(df, x >= 1)` | `subset(df, :x => ByRow(>=(1)))` | +| Pick rows | `filter(df, x >= 1)` | `subset(df, :x => ByRow(x -> x >= 1))` | | Sort rows | `arrange(df, x)` | `sort(df, :x)` | +Note that in the table above the `x -> x >= 1` predicate can be more compactly +written as `>=(1)`. The latter form has an additional benefit that it is +compiled only once per Julia session (as opposed to `x -> x >= 1` which defines +a new anonymous function every time it is introduced). 
+ As in dplyr, some of these functions can be applied to grouped data frames, in which case they operate by group: | Operation | dplyr | DataFrames.jl | diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index aad3fa6fd8..eddb4f0a9f 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -6,7 +6,7 @@ _process_subset_pair(i::Int, @nospecialize(a::Pair{<:Any, <:Base.Callable})) = _process_subset_pair(i::Int, a) = throw(ArgumentError("condition specifier $a is not supported by `subset`")) -function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame}, +@noinline function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame}, @nospecialize(args), skipmissing::Bool) conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)] @@ -18,14 +18,42 @@ function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame}, df_conditions = select(df, conditions..., copycols=!(parent(df) isa DataFrame), keepkeys=false) end - test_type = skipmissing ? Union{Missing, Bool} : Bool for col in eachcol(df_conditions) - if !(eltype(col) <: test_type) + if !(eltype(col) <: Union{Missing, Bool}) throw(ArgumentError("each transformation must produce a vector whose " * - "eltype is subtype of $test_type")) + "eltype is subtype of Union{Missing, Bool}")) end end + @assert ncol(df_conditions) == length(conditions) + + if ncol(df_conditions) == 1 + cond = df_conditions[!, 1] + else + cond = .&(eachcol(df_conditions)...) 
+ end + + @assert eltype(cond) <: Union{Bool, Missing} + + if skipmissing + return coalesce.(cond, false) + else + # actually currently the inner condition is only evaluated if actually + # we have some missings in cond, but it might change in the future so + # I leave this check + if Missing <: eltype(cond) + pos = findfirst(ismissing, cond) + if pos !== nothing + # note that the user might have passed a GroupedDataFrame but we + # then referer to the parent data frame of this GroupedDataFrame + throw(ArgumentError("skipmissing=false and in row $pos of the" * + " passed conditions were evaluated to" * + " missing value")) + end + end + return cond + end + if ncol(df_conditions) == 1 return .===(df_conditions[!, 1], true) else @@ -40,9 +68,9 @@ end Return a copy of data frame `df` or parent of `gdf` containing only rows for which all values produced by transformation(s) `args` for a given row are `true`. -If `skipmissing=true`, returning `missing` in `args` is allowed and corresponding rows are -dropped. If `skipmissing=false` (the default) an error is thrown if `missing` is -present. +If `skipmissing=true`, producing `missing` in conjunction of results produced by +`args` is allowed and corresponding rows are dropped. If `skipmissing=false` +(the default) an error is thrown if `missing` is produced. Each argument passed in `args` can be either a single column selector or a `source_columns => function` transformation specifier following the rules @@ -65,54 +93,54 @@ See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref), [`select`](@r julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false], z=[true, true, missing, missing], v=[1, 2, 11, 12]) 4×5 DataFrame -│ Row │ id │ x │ y │ z │ v │ -│ │ Int64 │ Bool │ Bool │ Bool? 
│ Int64 │ -├─────┼───────┼──────┼──────┼─────────┼───────┤ -│ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ -│ 2 │ 2 │ 0 │ 1 │ 1 │ 2 │ -│ 3 │ 3 │ 1 │ 0 │ missing │ 11 │ -│ 4 │ 4 │ 0 │ 0 │ missing │ 12 │ + Row │ id x y z v + │ Int64 Bool Bool Bool? Int64 +─────┼───────────────────────────────────── + 1 │ 1 true true true 1 + 2 │ 2 false true true 2 + 3 │ 3 true false missing 11 + 4 │ 4 false false missing 12 julia> subset(df, :x) 2×5 DataFrame -│ Row │ id │ x │ y │ z │ v │ -│ │ Int64 │ Bool │ Bool │ Bool? │ Int64 │ -├─────┼───────┼──────┼──────┼─────────┼───────┤ -│ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ -│ 2 │ 3 │ 1 │ 0 │ missing │ 11 │ + Row │ id x y z v + │ Int64 Bool Bool Bool? Int64 +─────┼──────────────────────────────────── + 1 │ 1 true true true 1 + 2 │ 3 true false missing 11 julia> subset(df, :v => x -> x .> 3) 2×5 DataFrame -│ Row │ id │ x │ y │ z │ v │ -│ │ Int64 │ Bool │ Bool │ Bool? │ Int64 │ -├─────┼───────┼──────┼──────┼─────────┼───────┤ -│ 1 │ 3 │ 1 │ 0 │ missing │ 11 │ -│ 2 │ 4 │ 0 │ 0 │ missing │ 12 │ + Row │ id x y z v + │ Int64 Bool Bool Bool? Int64 +─────┼───────────────────────────────────── + 1 │ 3 true false missing 11 + 2 │ 4 false false missing 12 julia> subset(df, :x, :y => ByRow(!)) 1×5 DataFrame -│ Row │ id │ x │ y │ z │ v │ -│ │ Int64 │ Bool │ Bool │ Bool? │ Int64 │ -├─────┼───────┼──────┼──────┼─────────┼───────┤ -│ 1 │ 3 │ 1 │ 0 │ missing │ 11 │ + Row │ id x y z v + │ Int64 Bool Bool Bool? Int64 +─────┼──────────────────────────────────── + 1 │ 3 true false missing 11 julia> subset(df, :x, :z, skipmissing=true) 1×5 DataFrame -│ Row │ id │ x │ y │ z │ v │ -│ │ Int64 │ Bool │ Bool │ Bool? │ Int64 │ -├─────┼───────┼──────┼──────┼───────┼───────┤ -│ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ + Row │ id x y z v + │ Int64 Bool Bool Bool? 
Int64 +─────┼───────────────────────────────── + 1 │ 1 true true true 1 julia> subset(df, :x, :z) -ERROR: ArgumentError: each transformation must return a vector whose eltype is subtype of Bool +ERROR: ArgumentError: skipmissing=false and in row 3 of the passed conditions were evaluated to missing value julia> subset(groupby(df, :y), :v => x -> x .> minimum(x)) 2×5 DataFrame -│ Row │ id │ x │ y │ z │ v │ -│ │ Int64 │ Bool │ Bool │ Bool? │ Int64 │ -├─────┼───────┼──────┼──────┼─────────┼───────┤ -│ 1 │ 2 │ 0 │ 1 │ 1 │ 2 │ -│ 2 │ 4 │ 0 │ 0 │ missing │ 12 │ + Row │ id x y z v + │ Int64 Bool Bool Bool? Int64 +─────┼───────────────────────────────────── + 1 │ 2 false true true 2 + 2 │ 4 false false missing 12 ``` """ @inline function subset(df::AbstractDataFrame, @nospecialize(args...); @@ -135,9 +163,9 @@ end Update data frame `df` or the parent of `gdf` in place to contain only rows for which all values produced by transformation(s) `args` for a given row is `true`. -If `skipmissing=true` returing `missing` in `args` is allowed and these rows are -dropped. If `skipmissing=false` (the default) an error is thrown if `missing` is -present. +If `skipmissing=true`, producing `missing` in conjunction of results produced by +`args` is allowed and corresponding rows are dropped. If `skipmissing=false` +(the default) an error is thrown if `missing` is produced. 
Each argument passed in `args` can be either a single column selector or a `source_columns => function` transformation specifier following the rules @@ -157,25 +185,22 @@ See also: [`subset`](@ref), [`filter`](@ref), [`filter!`](@ref) ``` julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false]) 4×3 DataFrame -│ Row │ id │ x │ y │ -│ │ Int64 │ Bool │ Bool │ -├─────┼───────┼──────┼──────┤ -│ 1 │ 1 │ 1 │ 1 │ -│ 2 │ 2 │ 0 │ 1 │ -│ 3 │ 3 │ 1 │ 0 │ -│ 4 │ 4 │ 0 │ 0 │ + Row │ id x y + │ Int64 Bool Bool +─────┼───────────────────── + 1 │ 1 true true + 2 │ 2 false true + 3 │ 3 true false + 4 │ 4 false false -julia> subset!(copy(df), :x, :y => ByRow(!)); +julia> subset!(df, :x, :y => ByRow(!)); julia> df -4×3 DataFrame -│ Row │ id │ x │ y │ -│ │ Int64 │ Bool │ Bool │ -├─────┼───────┼──────┼──────┤ -│ 1 │ 1 │ 1 │ 1 │ -│ 2 │ 2 │ 0 │ 1 │ -│ 3 │ 3 │ 1 │ 0 │ -│ 4 │ 4 │ 0 │ 0 │ +1×3 DataFrame + Row │ id x y + │ Int64 Bool Bool +─────┼──────────────────── + 1 │ 3 true false julia> df = DataFrame(id=1:4, y=[true, true, false, false], v=[1, 2, 11, 12]); @@ -183,26 +208,34 @@ julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x)); julia> df 2×3 DataFrame -│ Row │ id │ y │ v │ -│ │ Int64 │ Bool │ Int64 │ -├─────┼───────┼──────┼───────┤ -│ 1 │ 2 │ 1 │ 2 │ -│ 2 │ 4 │ 0 │ 12 │ + Row │ id y v + │ Int64 Bool Int64 +─────┼───────────────────── + 1 │ 2 true 2 + 2 │ 4 false 12 julia> df = DataFrame(id=1:4, x=[true, false, true, false], - z=[true, true, missing, missing], v=1:4); + z=[true, true, missing, missing], v=1:4) +4×4 DataFrame + Row │ id x z v + │ Int64 Bool Bool? 
Int64 +─────┼────────────────────────────── + 1 │ 1 true true 1 + 2 │ 2 false true 2 + 3 │ 3 true missing 3 + 4 │ 4 false missing 4 julia> subset!(df, :x, :z) -ERROR: ArgumentError: each transformation must return a vector whose eltype is subtype of Bool +ERROR: ArgumentError: skipmissing=false and in row 3 of the passed conditions were evaluated to missing value julia> subset!(df, :x, :z, skipmissing=true); julia> df 1×4 DataFrame -│ Row │ id │ x │ z │ v │ -│ │ Int64 │ Bool │ Bool? │ Int64 │ -├─────┼───────┼──────┼───────┼───────┤ -│ 1 │ 1 │ 1 │ 1 │ 1 │ + Row │ id x z v + │ Int64 Bool Bool? Int64 +─────┼─────────────────────────── + 1 │ 1 true true 1 julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false], z=[true, true, missing, missing], v=[1, 2, 11, 12]); @@ -211,11 +244,11 @@ julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x)); julia> df 2×5 DataFrame -│ Row │ id │ x │ y │ z │ v │ -│ │ Int64 │ Bool │ Bool │ Bool? │ Int64 │ -├─────┼───────┼──────┼──────┼─────────┼───────┤ -│ 1 │ 2 │ 0 │ 1 │ 1 │ 2 │ -│ 2 │ 4 │ 0 │ 0 │ missing │ 12 │ + Row │ id x y z v + │ Int64 Bool Bool Bool? 
Int64 +─────┼───────────────────────────────────── + 1 │ 2 false true true 2 + 2 │ 4 false false missing 12 ``` """ function subset!(df::AbstractDataFrame, @nospecialize(args...); skipmissing::Bool=false) diff --git a/test/grouping.jl b/test/grouping.jl index 4ae2ecb553..725ce338ff 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -3300,6 +3300,9 @@ end filter([:x, :id] => (x, id) -> x && id < 4, df) @test_throws ArgumentError subset(df) @test isempty(subset(df, :x, :x => ByRow(!))) + @test isempty(subset(df, :x => x -> false, :x => x -> missing)) + @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> 2) end for df in (copy(refdf), @view copy(refdf)[1:end-1, :]), @@ -3325,16 +3328,22 @@ end filter([:x, :id] => (x, id) -> x && id < 4, df) @test_throws ArgumentError subset(gdf) @test isempty(subset(gdf, :x, :x => ByRow(!))) + @test isempty(subset(gdf, :x => x -> false, :x => x -> missing)) + @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> 2) end df = copy(refdf) + @test subset!(df, :x) === df @test subset!(df, :x) ≅ df ≅ filter(:x => identity, refdf) df = copy(refdf) @test_throws ArgumentError subset!(df, :y) @test df ≅ refdf df = copy(refdf) + @test subset!(df, :y, skipmissing=true) === df @test subset!(df, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) df = copy(refdf) + @test subset!(df, :x, :y, skipmissing=true) === df @test subset!(df, :x, :y, skipmissing=true) ≅ df ≅ filter([:x, :y] => (x, y) -> x && y === true, refdf) df = copy(refdf) @@ -3349,6 +3358,15 @@ end @test isempty(subset!(df, :x, :x => ByRow(!))) @test isempty(df) + df = copy(refdf) + @test isempty(subset!(df, :x => x -> false, :x => x -> missing)) + df = copy(refdf) + @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset!(df, :x => x 
-> true, :x => x -> 2) + + df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :x) === df df = copy(refdf) gdf = groupby(df, :z) @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf) @@ -3358,9 +3376,15 @@ end @test df ≅ refdf df = copy(refdf) gdf = groupby(df, :z) + @test subset!(gdf, :y, skipmissing=true) === df + df = copy(refdf) + gdf = groupby(df, :z) @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) df = copy(refdf) gdf = groupby(df, :z) + @test subset!(gdf, :x, :y, skipmissing=true) === df + df = copy(refdf) + gdf = groupby(df, :z) @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅ filter([:x, :y] => (x, y) -> x && y === true, refdf) df = copy(refdf) @@ -3378,6 +3402,13 @@ end gdf = groupby(df, :z) @test isempty(subset!(gdf, :x, :x => ByRow(!))) @test isempty(df) + df = copy(refdf) + gdf = groupby(df, :z) + @test isempty(subset!(gdf, :x => x -> false, :x => x -> missing)) + df = copy(refdf) + gdf = groupby(df, :z) + @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2) df = copy(refdf) gdf = groupby(df, :z)[[3, 2, 1]] @@ -3409,6 +3440,14 @@ end @test isempty(subset!(gdf, :x, :x => ByRow(!))) @test isempty(df) + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test isempty(subset!(gdf, :x => x -> false, :x => x -> missing)) + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2) + @test_throws ArgumentError subset!(view(refdf, :, :), :x) @test_throws ArgumentError subset!(groupby_checked(view(refdf, :, :), :z), :x) From 1859935781b322e02983439a70b0d35b8c60446a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 20 Dec 2020 22:33:09 +0100 Subject: [PATCH 05/16] remove dead code --- src/abstractdataframe/subset.jl | 6 ------ 1 file 
changed, 6 deletions(-) diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index eddb4f0a9f..1d551e91df 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -53,12 +53,6 @@ _process_subset_pair(i::Int, a) = end return cond end - - if ncol(df_conditions) == 1 - return .===(df_conditions[!, 1], true) - else - return .===(.&(eachcol(df_conditions)...), true) - end end """ From fe29da164c6786f4f755ed80c200fc302b272e3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 1 Jan 2021 16:51:01 +0100 Subject: [PATCH 06/16] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- docs/src/man/comparisons.md | 2 +- src/abstractdataframe/subset.jl | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md index 28c819876d..2730b64f61 100644 --- a/docs/src/man/comparisons.md +++ b/docs/src/man/comparisons.md @@ -245,7 +245,7 @@ The following table compares the main functions of DataFrames.jl with Stata: | Add new columns | `egen x_mean = mean(x)` | `transform!(df, :x => mean => :x_mean)` | | Rename columns | `rename x x_new` | `rename!(df, :x => :x_new)` | | Pick columns | `keep x y` | `select!(df, :x, :y)` | -| Pick rows | `keep if x >= 1` | `subset!(df, :x => ByRow(>=(1))` | +| Pick rows | `keep if x >= 1` | `subset!(df, :x => ByRow(x -> x >= 1))` | | Sort rows | `sort x` | `sort!(df, :x)` | Note that the suffix `!` (i.e. 
`transform!`, `select!`, etc) ensures that the operation transforms the dataframe in place, as in Stata diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index 1d551e91df..3d0e8282d1 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -38,17 +38,17 @@ _process_subset_pair(i::Int, a) = if skipmissing return coalesce.(cond, false) else - # actually currently the inner condition is only evaluated if actually + # currently the inner condition is only evaluated if actually # we have some missings in cond, but it might change in the future so # I leave this check if Missing <: eltype(cond) pos = findfirst(ismissing, cond) if pos !== nothing # note that the user might have passed a GroupedDataFrame but we - # then referer to the parent data frame of this GroupedDataFrame - throw(ArgumentError("skipmissing=false and in row $pos of the" * + # then refer to the parent data frame of this GroupedDataFrame + throw(ArgumentError("skipmissing=false but at row $pos the" * " passed conditions were evaluated to" * - " missing value")) + " a missing value")) end end return cond @@ -62,9 +62,12 @@ end Return a copy of data frame `df` or parent of `gdf` containing only rows for which all values produced by transformation(s) `args` for a given row are `true`. -If `skipmissing=true`, producing `missing` in conjunction of results produced by -`args` is allowed and corresponding rows are dropped. If `skipmissing=false` -(the default) an error is thrown if `missing` is produced. +If `skipmissing=false` (the default), an error is thrown if the conjunction (i.e. `&`) +of results produced by `args` is `missing`. If `skipmissing=true`, `missing` +is allowed and corresponding rows are dropped. 
Note that if some transformations +in `args` return `false` and others `missing` for a given row, the conjunction is `false`, +but that if some return `true` and others `missing`, the conjunction is `missing` +([three-valued logic](https://en.wikipedia.org/wiki/Three-valued_logic)). Each argument passed in `args` can be either a single column selector or a `source_columns => function` transformation specifier following the rules @@ -126,7 +129,7 @@ julia> subset(df, :x, :z, skipmissing=true) 1 │ 1 true true true 1 julia> subset(df, :x, :z) -ERROR: ArgumentError: skipmissing=false and in row 3 of the passed conditions were evaluated to missing value +ERROR: ArgumentError: skipmissing=false but at row 3 the passed conditions were evaluated to a missing value julia> subset(groupby(df, :y), :v => x -> x .> minimum(x)) 2×5 DataFrame @@ -220,7 +223,7 @@ julia> df = DataFrame(id=1:4, x=[true, false, true, false], 4 │ 4 false missing 4 julia> subset!(df, :x, :z) -ERROR: ArgumentError: skipmissing=false and in row 3 of the passed conditions were evaluated to missing value +ERROR: ArgumentError: skipmissing=false but at row 3 the passed conditions were evaluated to a missing value julia> subset!(df, :x, :z, skipmissing=true); From 33966dfd625db8f4bbc184dce1615648ca92da6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 1 Jan 2021 19:10:01 +0100 Subject: [PATCH 07/16] changes after code review --- docs/src/man/comparisons.md | 10 ++--- src/abstractdataframe/subset.jl | 78 ++++++++++++++------------------- test/grouping.jl | 24 +++++----- 3 files changed, 52 insertions(+), 60 deletions(-) diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md index 2730b64f61..fabfa5ef36 100644 --- a/docs/src/man/comparisons.md +++ b/docs/src/man/comparisons.md @@ -12,6 +12,11 @@ df = DataFrame(grp = repeat(1:2, 3), x = 6:-1:1, y = 4:9, z = [3:7; missing], id df2 = DataFrame(grp = [1, 3], w = [10, 11]) ``` +Note that in the comparisons 
presented below predicates like `x -> x >= 1` can +be more compactly written as `>=(1)`. The latter form has an additional benefit +that it is compiled only once per Julia session (as opposed to `x -> x >= 1` +which defines a new anonymous function every time it is introduced). + ## Comparison with the Python package pandas The following table compares the main functions of DataFrames.jl with the Python package pandas (version 1.1.0): @@ -207,11 +212,6 @@ df <- tibble(grp = rep(1:2, 3), x = 6:1, y = 4:9, | Pick rows | `filter(df, x >= 1)` | `subset(df, :x => ByRow(x -> x >= 1))` | | Sort rows | `arrange(df, x)` | `sort(df, :x)` | -Note that in the table above the `x -> x >= 1` predicate can be more compactly -written as `=>(1)`. The latter form has an additional benefit that it is -compiled only once per Julia session (as opposed to `x -> x >= 1` which defines -a new anonymous function every time it is introduced). - As in dplyr, some of these functions can be applied to grouped data frames, in which case they operate by group: | Operation | dplyr | DataFrames.jl | diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index 3d0e8282d1..8703d8ba10 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -6,6 +6,17 @@ _process_subset_pair(i::Int, @nospecialize(a::Pair{<:Any, <:Base.Callable})) = _process_subset_pair(i::Int, a) = throw(ArgumentError("condition specifier $a is not supported by `subset`")) +_and(x::Bool) = x +_and(x::Bool, y::Bool...) = x && _and(y...) +_and(x::Any...) = + throw(ArgumentError("set of non-boolean values $x passed in boolean context")) + +_and_missing(x::Bool) = x +_and_missing(x::Bool, y::Union{Bool, Missing}...) = x && _and_missing(y...) +_and_missing(x::Missing, y::Union{Bool, Missing}...) = false +_and_missing(x::Any...) 
= + throw(ArgumentError("set of non-boolean values $x passed in boolean context")) + @noinline function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame}, @nospecialize(args), skipmissing::Bool) conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)] @@ -18,41 +29,17 @@ _process_subset_pair(i::Int, a) = df_conditions = select(df, conditions..., copycols=!(parent(df) isa DataFrame), keepkeys=false) end - for col in eachcol(df_conditions) - if !(eltype(col) <: Union{Missing, Bool}) - throw(ArgumentError("each transformation must produce a vector whose " * - "eltype is subtype of Union{Missing, Bool}")) - end - end @assert ncol(df_conditions) == length(conditions) - if ncol(df_conditions) == 1 - cond = df_conditions[!, 1] - else - cond = .&(eachcol(df_conditions)...) - end - - @assert eltype(cond) <: Union{Bool, Missing} - if skipmissing - return coalesce.(cond, false) + cond = _and_missing.(eachcol(df_conditions)...) else - # currently the inner condition is only evaluated if actually - # we have some missings in cond, but it might change in the future so - # I leave this check - if Missing <: eltype(cond) - pos = findfirst(ismissing, cond) - if pos !== nothing - # note that the user might have passed a GroupedDataFrame but we - # then refer to the parent data frame of this GroupedDataFrame - throw(ArgumentError("skipmissing=false but at row $pos the" * - " passed conditions were evaluated to" * - " a missing value")) - end - end - return cond + cond = _and.(eachcol(df_conditions)...) end + + @assert eltype(cond) === Bool + return cond end """ @@ -62,26 +49,26 @@ end Return a copy of data frame `df` or parent of `gdf` containing only rows for which all values produced by transformation(s) `args` for a given row are `true`. -If `skipmissing=false` (the default), an error is thrown if the conjunction (i.e. `&`) -of results produced by `args` is `missing`. 
If `skipmissing=true`, `missing` -is allowed and corresponding rows are dropped. Note that if some transformations -in `args` return `false` and others `missing` for a given row, the conjunction is `false`, -but that if some return `true` and others `missing`, the conjunction is `missing` -([three-valued logic](https://en.wikipedia.org/wiki/Three-valued_logic)). - Each argument passed in `args` can be either a single column selector or a `source_columns => function` transformation specifier following the rules described for [`select`](@ref). Note that as opposed to [`filter`](@ref) the `subset` function works on whole -columns (or all rows in groups for `GroupedDataFrame`) and by default drops rows +columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows for which condition is false. +If `skipmissing=false` (the default) vectors containing only `Bool` values are +allowed to be produced by `args`. If `skipmissing=true`, additionally `missing` +is allowed and it is treated as `false` (corresponding rows are skipped). + If `view=true` a `SubDataFrame` view is returned instead of a `DataFrame`. If a `GroupedDataFrame` is passed then it must include all groups present in the `parent` data frame, like in [`select!`](@ref). +Note that this function will have a large compilation time if more than 32 +conditions are passed as `args`. 
+ See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref), [`select`](@ref) # Examples @@ -129,7 +116,7 @@ julia> subset(df, :x, :z, skipmissing=true) 1 │ 1 true true true 1 julia> subset(df, :x, :z) -ERROR: ArgumentError: skipmissing=false but at row 3 the passed conditions were evaluated to a missing value +ERROR: ArgumentError: set of non-boolean values (true, missing) passed in boolean context julia> subset(groupby(df, :y), :v => x -> x .> minimum(x)) 2×5 DataFrame @@ -160,21 +147,24 @@ end Update data frame `df` or the parent of `gdf` in place to contain only rows for which all values produced by transformation(s) `args` for a given row is `true`. -If `skipmissing=true`, producing `missing` in conjunction of results produced by -`args` is allowed and corresponding rows are dropped. If `skipmissing=false` -(the default) an error is thrown if `missing` is produced. - Each argument passed in `args` can be either a single column selector or a `source_columns => function` transformation specifier following the rules described for [`select`](@ref). Note that as opposed to [`filter!`](@ref) the `subset!` function works on whole -columns (or all rows in groups for `GroupedDataFrame`) and by default drops rows +columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows for which contition is false. +If `skipmissing=false` (the default) vectors containing only `Bool` values are +allowed to be produced by `args`. If `skipmissing=true`, additionally `missing` +is allowed and it is treated as `false` (corresponding rows are skipped). + If `GroupedDataFrame` is subsetted then it must include all groups present in the `parent` data frame, like in [`select!`](@ref). +Note that this function will have a large compilation time if more than 32 +conditions are passed as `args`. 
+ See also: [`subset`](@ref), [`filter`](@ref), [`filter!`](@ref) # Examples @@ -223,7 +213,7 @@ julia> df = DataFrame(id=1:4, x=[true, false, true, false], 4 │ 4 false missing 4 julia> subset!(df, :x, :z) -ERROR: ArgumentError: skipmissing=false but at row 3 the passed conditions were evaluated to a missing value +ERROR: ArgumentError: set of non-boolean values (true, missing) passed in boolean context julia> subset!(df, :x, :z, skipmissing=true); diff --git a/test/grouping.jl b/test/grouping.jl index 725ce338ff..e76e353632 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -3273,7 +3273,7 @@ end end @testset "subset and subset!" begin - refdf = DataFrame(x = repeat([true, false], 4), + refdf = DataFrame(x = repeat(Any[true, false], 4), y = repeat([true, false, missing, missing], 2), z = repeat([1, 2, 3, 3], 2), id = 1:8) @@ -3300,7 +3300,7 @@ end filter([:x, :id] => (x, id) -> x && id < 4, df) @test_throws ArgumentError subset(df) @test isempty(subset(df, :x, :x => ByRow(!))) - @test isempty(subset(df, :x => x -> false, :x => x -> missing)) + @test_throws ArgumentError subset(df, :x => x -> false, :x => x -> missing) @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> missing) @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> 2) end @@ -3328,7 +3328,7 @@ end filter([:x, :id] => (x, id) -> x && id < 4, df) @test_throws ArgumentError subset(gdf) @test isempty(subset(gdf, :x, :x => ByRow(!))) - @test isempty(subset(gdf, :x => x -> false, :x => x -> missing)) + @test_throws ArgumentError subset(gdf, :x => x -> false, :x => x -> missing) @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> missing) @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> 2) end @@ -3359,8 +3359,7 @@ end @test isempty(df) df = copy(refdf) - @test isempty(subset!(df, :x => x -> false, :x => x -> missing)) - df = copy(refdf) + @test_throws ArgumentError subset!(df, :x => x -> false, :x => x -> missing) @test_throws ArgumentError 
subset!(df, :x => x -> true, :x => x -> missing) @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> 2) @@ -3404,9 +3403,7 @@ end @test isempty(df) df = copy(refdf) gdf = groupby(df, :z) - @test isempty(subset!(gdf, :x => x -> false, :x => x -> missing)) - df = copy(refdf) - gdf = groupby(df, :z) + @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing) @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing) @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2) @@ -3442,9 +3439,7 @@ end df = copy(refdf) gdf = groupby(df, :z)[[3, 2, 1]] - @test isempty(subset!(gdf, :x => x -> false, :x => x -> missing)) - df = copy(refdf) - gdf = groupby(df, :z)[[3, 2, 1]] + @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing) @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing) @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2) @@ -3462,6 +3457,13 @@ end @test subset(df, :x => x -> true) ≅ df @test_throws ArgumentError subset(df, :x => x -> (a=x,)) @test_throws ArgumentError subset(df, :x => (x -> (a=x,)) => AsTable) + + @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :y) + @test_throws ArgumentError subset(DataFrame(x=false, y=1), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y) end @testset "make sure we handle idx correctly when groups are reordered" begin From 48cc61800f4869f1026472c782ac6886afb9604a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 1 Jan 2021 22:45:25 +0100 Subject: [PATCH 08/16] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/subset.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 
deletions(-) diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index 8703d8ba10..27a0bade68 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -57,9 +57,10 @@ Note that as opposed to [`filter`](@ref) the `subset` function works on whole columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows for which condition is false. -If `skipmissing=false` (the default) vectors containing only `Bool` values are -allowed to be produced by `args`. If `skipmissing=true`, additionally `missing` -is allowed and it is treated as `false` (corresponding rows are skipped). +If `skipmissing=false` (the default) `args` are required to produce vectors containing +only `Bool` values. +If `skipmissing=true`, additionally `missing` is allowed and it is treated as `false` +(i.e. rows for which one of the conditions returns `missing` are skipped). If `view=true` a `SubDataFrame` view is returned instead of a `DataFrame`. From 1c286395946b9f61564b5a624198d7b288c3f12b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 1 Jan 2021 23:09:26 +0100 Subject: [PATCH 09/16] corrections after code review --- src/abstractdataframe/subset.jl | 58 ++++++++++++++++++++++----------- test/grouping.jl | 12 +++++++ 2 files changed, 51 insertions(+), 19 deletions(-) diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index 27a0bade68..7a035ef91b 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -6,17 +6,42 @@ _process_subset_pair(i::Int, @nospecialize(a::Pair{<:Any, <:Base.Callable})) = _process_subset_pair(i::Int, a) = throw(ArgumentError("condition specifier $a is not supported by `subset`")) +_and() = throw(ArgumentError("at least one condition must be passed")) _and(x::Bool) = x _and(x::Bool, y::Bool...) = x && _and(y...) -_and(x::Any...) 
= - throw(ArgumentError("set of non-boolean values $x passed in boolean context")) +function _and(x::Any...) + loc = findfirst(x -> !(x isa Bool), x) + # we know x has positive length and must contain non-boolean + @assert !isnothing(loc) + xv = x[loc] + if ismissing(xv) + throw(ArgumentError("missing was returned in condition number $loc " * + "but only true or false are allowed; pass " * + "skipmissing=true to skip missing values")) + else + throw(ArgumentError("value $xv was returned in condition number $loc " * + "but only true or false are allowed")) + end +end + +_and_missing() = throw(ArgumentError("at least one condition must be passed")) _and_missing(x::Bool) = x _and_missing(x::Bool, y::Union{Bool, Missing}...) = x && _and_missing(y...) _and_missing(x::Missing, y::Union{Bool, Missing}...) = false -_and_missing(x::Any...) = - throw(ArgumentError("set of non-boolean values $x passed in boolean context")) +function _and_missing(x::Any...) + loc = findfirst(x -> !(x isa Union{Bool, Missing}), x) + # we know x has positive length and must contain non-boolean + @assert !isnothing(loc) + xv = x[loc] + throw(ArgumentError("value $xv was returned in condition number $loc " * + "but only true, false, or missing are allowed")) +end + + +# Note that _get_subset_conditions will have a large compilation time +# if more than 32 conditions are passed as `args`. @noinline function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame}, @nospecialize(args), skipmissing::Bool) conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)] @@ -57,19 +82,16 @@ Note that as opposed to [`filter`](@ref) the `subset` function works on whole columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows for which condition is false. -If `skipmissing=false` (the default) `args` are required to produce vectors containing -only `Bool` values. -If `skipmissing=true`, additionally `missing` is allowed and it is treated as `false` -(i.e. 
rows for which one of the conditions returns `missing` are skipped). +If `skipmissing=false` (the default) `args` are required to produce vectors +containing only `Bool` values. If `skipmissing=true`, additionally `missing` is +allowed and it is treated as `false` (i.e. rows for which one of the conditions +returns `missing` are skipped). If `view=true` a `SubDataFrame` view is returned instead of a `DataFrame`. If a `GroupedDataFrame` is passed then it must include all groups present in the `parent` data frame, like in [`select!`](@ref). -Note that this function will have a large compilation time if more than 32 -conditions are passed as `args`. - See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref), [`select`](@ref) # Examples @@ -117,7 +139,7 @@ julia> subset(df, :x, :z, skipmissing=true) 1 │ 1 true true true 1 julia> subset(df, :x, :z) -ERROR: ArgumentError: set of non-boolean values (true, missing) passed in boolean context +ERROR: ArgumentError: missing was returned in condition number 2 but only true or false are allowed; pass skipmissing=true to skip missing values julia> subset(groupby(df, :y), :v => x -> x .> minimum(x)) 2×5 DataFrame @@ -156,16 +178,14 @@ Note that as opposed to [`filter!`](@ref) the `subset!` function works on whole columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows for which contition is false. -If `skipmissing=false` (the default) vectors containing only `Bool` values are -allowed to be produced by `args`. If `skipmissing=true`, additionally `missing` -is allowed and it is treated as `false` (corresponding rows are skipped). +If `skipmissing=false` (the default) `args` are required to produce vectors +containing only `Bool` values. If `skipmissing=true`, additionally `missing` is +allowed and it is treated as `false` (i.e. rows for which one of the conditions +returns `missing` are skipped). 
If `GroupedDataFrame` is subsetted then it must include all groups present in the `parent` data frame, like in [`select!`](@ref). -Note that this function will have a large compilation time if more than 32 -conditions are passed as `args`. - See also: [`subset`](@ref), [`filter`](@ref), [`filter!`](@ref) # Examples @@ -214,7 +234,7 @@ julia> df = DataFrame(id=1:4, x=[true, false, true, false], 4 │ 4 false missing 4 julia> subset!(df, :x, :z) -ERROR: ArgumentError: set of non-boolean values (true, missing) passed in boolean context +ERROR: ArgumentError: missing was returned in condition number 2 but only true or false are allowed; pass skipmissing=true to skip missing values julia> subset!(df, :x, :z, skipmissing=true); diff --git a/test/grouping.jl b/test/grouping.jl index e76e353632..d0c5af5397 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -3284,10 +3284,15 @@ end @test subset(df, :x, view=true) ≅ filter(:x => identity, df) @test subset(df, :x, view=true) isa SubDataFrame @test_throws ArgumentError subset(df, :y) + @test_throws ArgumentError subset(df, :y, :x) @test subset(df, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) @test subset(df, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) + @test subset(df, :y, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) + @test subset(df, :y, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) @test subset(df, :x, :y, skipmissing=true) ≅ filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(df, :y, :x, skipmissing=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) @test subset(df, :x, :y, skipmissing=true, view=true) ≅ filter([:x, :y] => (x, y) -> x && y === true, df) @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ @@ -3312,10 +3317,15 @@ end @test subset(gdf, :x, view=true) ≅ filter(:x => identity, df) @test subset(gdf, :x, view=true) isa SubDataFrame @test_throws ArgumentError subset(gdf, :y) + @test_throws ArgumentError 
subset(gdf, :y, :x) @test subset(gdf, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) @test subset(gdf, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) + @test subset(gdf, :y, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) + @test subset(gdf, :y, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) @test subset(gdf, :x, :y, skipmissing=true) ≅ filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(gdf, :y, :x, skipmissing=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) @test subset(gdf, :x, :y, skipmissing=true, view=true) ≅ filter([:x, :y] => (x, y) -> x && y === true, df) @test subset(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ @@ -3460,9 +3470,11 @@ end @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :x, :y) @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x) @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :y) @test_throws ArgumentError subset(DataFrame(x=false, y=1), :x, :y) @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x) @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y) end From 9655dafd7c7eee8eac95128f29c34a9d5d04b967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 2 Jan 2021 12:28:13 +0100 Subject: [PATCH 10/16] improve test coverage --- test/grouping.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/grouping.jl b/test/grouping.jl index d0c5af5397..e105199794 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -3476,6 +3476,14 @@ end @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y) @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x) @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y) + + @test_throws ArgumentError subset(DataFrame(x=false, y=1), :x, 
:y, skipmissing=true) + @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y, skipmissing=true) + @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x, skipmissing=true) + @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y, skipmissing=true) + + @test_throws ArgumentError DataFrames._and() + @test_throws ArgumentError DataFrames._and_missing() end @testset "make sure we handle idx correctly when groups are reordered" begin From 058c06e316375e4a61c3d6eb0dba1f82269ade3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 3 Jan 2021 12:50:50 +0100 Subject: [PATCH 11/16] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/subset.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index 7a035ef91b..d149dfa195 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -42,7 +42,7 @@ end # Note that _get_subset_conditions will have a large compilation time # if more than 32 conditions are passed as `args`. -@noinline function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame}, +function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame}, @nospecialize(args), skipmissing::Bool) conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)] @@ -92,7 +92,7 @@ If `view=true` a `SubDataFrame` view is returned instead of a `DataFrame`. If a `GroupedDataFrame` is passed then it must include all groups present in the `parent` data frame, like in [`select!`](@ref). -See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref), [`select`](@ref) +See also: [`subset!`](@ref), [`filter`](@ref), [`select`](@ref) # Examples @@ -186,7 +186,7 @@ returns `missing` are skipped). If `GroupedDataFrame` is subsetted then it must include all groups present in the `parent` data frame, like in [`select!`](@ref). 
-See also: [`subset`](@ref), [`filter`](@ref), [`filter!`](@ref) +See also: [`subset`](@ref), [`filter!`](@ref), [`select!`](@ref) # Examples From 2ae30ca6db1938e2d58500495ee8812357f2729a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 3 Jan 2021 13:30:35 +0100 Subject: [PATCH 12/16] add ungroup kwarg and fix docstring --- src/abstractdataframe/subset.jl | 30 +++++++++++------ test/grouping.jl | 60 +++++++++++++++++++++------------ 2 files changed, 59 insertions(+), 31 deletions(-) diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index d149dfa195..15c807b551 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -69,7 +69,8 @@ end """ subset(df::AbstractDataFrame, args...; skipmissing::Bool=false, view::Bool=false) - subset(gdf::GroupedDataFrame, args...; skipmissing::Bool=false, view::Bool=false) + subset(gdf::GroupedDataFrame, args...; skipmissing::Bool=false, view::Bool=false, + ungroup::Bool=true) Return a copy of data frame `df` or parent of `gdf` containing only rows for which all values produced by transformation(s) `args` for a given row are `true`. @@ -79,8 +80,7 @@ Each argument passed in `args` can be either a single column selector or a described for [`select`](@ref). Note that as opposed to [`filter`](@ref) the `subset` function works on whole -columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows -for which condition is false. +columns (or all rows in groups for `GroupedDataFrame`). If `skipmissing=false` (the default) `args` are required to produce vectors containing only `Bool` values. If `skipmissing=true`, additionally `missing` is @@ -89,6 +89,9 @@ returns `missing` are skipped). If `view=true` a `SubDataFrame` view is returned instead of a `DataFrame`. +If `ungroup=false` the return value of the operation on `gdf` is re-grouped +based on the same grouping columns and `GroupedDataFrame` is returned. 
+ If a `GroupedDataFrame` is passed then it must include all groups present in the `parent` data frame, like in [`select!`](@ref). @@ -157,15 +160,18 @@ julia> subset(groupby(df, :y), :v => x -> x .> minimum(x)) end @inline function subset(gdf::GroupedDataFrame, @nospecialize(args...); - skipmissing::Bool=false, view::Bool=false) + skipmissing::Bool=false, view::Bool=false, + ungroup::Bool=true) row_selector = _get_subset_conditions(gdf, args, skipmissing) df = parent(gdf) - return view ? Base.view(df, row_selector, :) : df[row_selector, :] + res = view ? Base.view(df, row_selector, :) : df[row_selector, :] + return ungroup ? res : groupby(res, groupcols(gdf)) end """ subset!(df::AbstractDataFrame, args...; skipmissing::Bool=false) - subset!(gdf::GroupedDataFrame{DataFrame}, args..., skipmissing::Bool=false) + subset!(gdf::GroupedDataFrame{DataFrame}, args...; skipmissing::Bool=false, + ungroup::Bool=true) Update data frame `df` or the parent of `gdf` in place to contain only rows for which all values produced by transformation(s) `args` for a given row is `true`. @@ -175,14 +181,16 @@ Each argument passed in `args` can be either a single column selector or a described for [`select`](@ref). Note that as opposed to [`filter!`](@ref) the `subset!` function works on whole -columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows -for which contition is false. +columns (or all rows in groups for `GroupedDataFrame`). If `skipmissing=false` (the default) `args` are required to produce vectors containing only `Bool` values. If `skipmissing=true`, additionally `missing` is allowed and it is treated as `false` (i.e. rows for which one of the conditions returns `missing` are skipped). +If `ungroup=false` the return value of the operation on `gdf` is re-grouped +based on the same grouping columns and `GroupedDataFrame` is returned.
+ If `GroupedDataFrame` is subsetted then it must include all groups present in the `parent` data frame, like in [`select!`](@ref). @@ -264,8 +272,10 @@ function subset!(df::AbstractDataFrame, @nospecialize(args...); skipmissing::Boo return delete!(df, findall(!, row_selector)) end -function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Bool=false) +function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Bool=false, + ungroup::Bool=true) row_selector = _get_subset_conditions(gdf, args, skipmissing) df = parent(gdf) - return delete!(df, findall(!, row_selector)) + res = delete!(df, findall(!, row_selector)) + return ungroup ? res : groupby(res, groupcols(gdf)) end diff --git a/test/grouping.jl b/test/grouping.jl index e105199794..cfa3ae2a25 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -3279,7 +3279,9 @@ end id = 1:8) for df in (copy(refdf), @view copy(refdf)[1:end-1, :]) + df2 = copy(df) @test subset(df, :x) ≅ filter(:x => identity, df) + @test df ≅ df2 @test subset(df, :x) isa DataFrame @test subset(df, :x, view=true) ≅ filter(:x => identity, df) @test subset(df, :x, view=true) isa SubDataFrame @@ -3312,10 +3314,18 @@ end for df in (copy(refdf), @view copy(refdf)[1:end-1, :]), gdf in (groupby_checked(df, :z), groupby_checked(df, :z)[[3, 2, 1]]) + df2 = copy(df) @test subset(gdf, :x) ≅ filter(:x => identity, df) + @test df ≅ df2 @test subset(gdf, :x) isa DataFrame + @test subset(gdf, :x, ungroup=false) ≅ + groupby_checked(filter(:x => identity, df), :z) + @test subset(gdf, :x, ungroup=false) isa GroupedDataFrame{DataFrame} @test subset(gdf, :x, view=true) ≅ filter(:x => identity, df) @test subset(gdf, :x, view=true) isa SubDataFrame + @test subset(gdf, :x, view=true, ungroup=false) ≅ + groupby_checked(filter(:x => identity, df), :z) + @test subset(gdf, :x, view=true, ungroup=false) isa GroupedDataFrame{<:SubDataFrame} @test_throws ArgumentError subset(gdf, :y) @test_throws ArgumentError subset(gdf, :y, :x) @test 
subset(gdf, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) @@ -3374,81 +3384,89 @@ end @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> 2) df = copy(refdf) - gdf = groupby(df, :z) + gdf = groupby_checked(df, :z) @test subset!(gdf, :x) === df + + df = copy(refdf) + gdf = groupby_checked(df, :z) + gdf2 = subset!(gdf, :x, ungroup=false) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test parent(gdf2) === df + @test gdf2 ≅ groupby_checked(df, :z) ≅ groupby_checked(filter(:x => identity, refdf), :z) + df = copy(refdf) - gdf = groupby(df, :z) + gdf = groupby_checked(df, :z) @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf) df = copy(refdf) - gdf = groupby(df, :z) + gdf = groupby_checked(df, :z) @test_throws ArgumentError subset!(gdf, :y) @test df ≅ refdf df = copy(refdf) - gdf = groupby(df, :z) + gdf = groupby_checked(df, :z) @test subset!(gdf, :y, skipmissing=true) === df df = copy(refdf) - gdf = groupby(df, :z) + gdf = groupby_checked(df, :z) @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) df = copy(refdf) - gdf = groupby(df, :z) + gdf = groupby_checked(df, :z) @test subset!(gdf, :x, :y, skipmissing=true) === df df = copy(refdf) - gdf = groupby(df, :z) + gdf = groupby_checked(df, :z) @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅ filter([:x, :y] => (x, y) -> x && y === true, refdf) df = copy(refdf) - gdf = groupby(df, :z) + gdf = groupby_checked(df, :z) @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅ filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf) df = copy(refdf) - gdf = groupby(df, :z) + gdf = groupby_checked(df, :z) @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅ filter([:x, :id] => (x, id) -> x && id < 4, refdf) df = copy(refdf) - gdf = groupby(df, :z) + gdf = groupby_checked(df, :z) @test_throws ArgumentError subset!(gdf) df = copy(refdf) - gdf = groupby(df, :z) + gdf = groupby_checked(df, :z) @test isempty(subset!(gdf, :x, :x => 
ByRow(!))) @test isempty(df) df = copy(refdf) - gdf = groupby(df, :z) + gdf = groupby_checked(df, :z) @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing) @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing) @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2) df = copy(refdf) - gdf = groupby(df, :z)[[3, 2, 1]] + gdf = groupby_checked(df, :z)[[3, 2, 1]] @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf) df = copy(refdf) - gdf = groupby(df, :z)[[3, 2, 1]] + gdf = groupby_checked(df, :z)[[3, 2, 1]] @test_throws ArgumentError subset!(gdf, :y) @test df ≅ refdf df = copy(refdf) - gdf = groupby(df, :z)[[3, 2, 1]] + gdf = groupby_checked(df, :z)[[3, 2, 1]] @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) df = copy(refdf) - gdf = groupby(df, :z)[[3, 2, 1]] + gdf = groupby_checked(df, :z)[[3, 2, 1]] @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅ filter([:x, :y] => (x, y) -> x && y === true, refdf) df = copy(refdf) - gdf = groupby(df, :z)[[3, 2, 1]] + gdf = groupby_checked(df, :z)[[3, 2, 1]] @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅ filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf) df = copy(refdf) - gdf = groupby(df, :z)[[3, 2, 1]] + gdf = groupby_checked(df, :z)[[3, 2, 1]] @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅ filter([:x, :id] => (x, id) -> x && id < 4, refdf) df = copy(refdf) - gdf = groupby(df, :z)[[3, 2, 1]] + gdf = groupby_checked(df, :z)[[3, 2, 1]] @test_throws ArgumentError subset!(gdf) df = copy(refdf) - gdf = groupby(df, :z)[[3, 2, 1]] + gdf = groupby_checked(df, :z)[[3, 2, 1]] @test isempty(subset!(gdf, :x, :x => ByRow(!))) @test isempty(df) df = copy(refdf) - gdf = groupby(df, :z)[[3, 2, 1]] + gdf = groupby_checked(df, :z)[[3, 2, 1]] @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing) @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing) 
@test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2) From 2f7a9192a6be9275eb0074da09304cf52afbc767 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 3 Jan 2021 17:31:19 +0100 Subject: [PATCH 13/16] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/subset.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index 15c807b551..be955d6711 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -89,8 +89,8 @@ returns `missing` are skipped). If `view=true` a `SubDataFrame` view is returned instead of a `DataFrame`. -If `ungroup=false` the return value of the operation on `gdf` is re-grouped -based on the same grouping columns and `GroupedDataFrame` is returned. +If `ungroup=false` the resulting data frame is re-grouped based on the same +grouping columns as `gdf` and a `GroupedDataFrame` is returned. If a `GroupedDataFrame` is passed then it must include all groups present in the `parent` data frame, like in [`select!`](@ref). From 1e481c6f439b34c8644e61cb0d233b52ebc914a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 3 Jan 2021 17:31:52 +0100 Subject: [PATCH 14/16] fix doc --- src/abstractdataframe/subset.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index be955d6711..ca8ea52a1a 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -188,8 +188,8 @@ containing only `Bool` values. If `skipmissing=true`, additionally `missing` is allowed and it is treated as `false` (i.e. rows for which one of the conditions returns `missing` are skipped). -If `ungroup=false` the return value of the operation on `gdf` is re-grouped -based on the same grouping columns and `GroupedDataFrame` is returned. 
+If `ungroup=false` the resulting data frame is re-grouped based on the same +grouping columns as `gdf` and a `GroupedDataFrame` is returned. If `GroupedDataFrame` is subsetted then it must include all groups present in the `parent` data frame, like in [`select!`](@ref). From e43afb535a1c1f704841ad32215a3e3435d08694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 3 Jan 2021 18:42:49 +0100 Subject: [PATCH 15/16] add TODO for faster groupby --- src/abstractdataframe/subset.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index ca8ea52a1a..b06dfeb80a 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -165,6 +165,7 @@ end row_selector = _get_subset_conditions(gdf, args, skipmissing) df = parent(gdf) res = view ? Base.view(df, row_selector, :) : df[row_selector, :] + # TODO: in some cases it might be faster to groupby gdf.groups[row_selector] return ungroup ? res : groupby(res, groupcols(gdf)) end @@ -277,5 +278,6 @@ function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Boo row_selector = _get_subset_conditions(gdf, args, skipmissing) df = parent(gdf) res = delete!(df, findall(!, row_selector)) + # TODO: in some cases it might be faster to groupby gdf.groups[row_selector] return ungroup ? 
res : groupby(res, groupcols(gdf)) end From 6704541c5db9bc68500f3955841c73382705088c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 9 Jan 2021 23:29:07 +0100 Subject: [PATCH 16/16] remove @inline --- src/abstractdataframe/subset.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index b06dfeb80a..6f0cdfca31 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -153,14 +153,14 @@ julia> subset(groupby(df, :y), :v => x -> x .> minimum(x)) 2 │ 4 false false missing 12 ``` """ -@inline function subset(df::AbstractDataFrame, @nospecialize(args...); - skipmissing::Bool=false, view::Bool=false) +function subset(df::AbstractDataFrame, @nospecialize(args...); + skipmissing::Bool=false, view::Bool=false) row_selector = _get_subset_conditions(df, args, skipmissing) return view ? Base.view(df, row_selector, :) : df[row_selector, :] end -@inline function subset(gdf::GroupedDataFrame, @nospecialize(args...); - skipmissing::Bool=false, view::Bool=false, +function subset(gdf::GroupedDataFrame, @nospecialize(args...); + skipmissing::Bool=false, view::Bool=false, ungroup::Bool=true) row_selector = _get_subset_conditions(gdf, args, skipmissing) df = parent(gdf)