From c3083fc7d3fa8cfbf3eab0d3dff28daa5acf7159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 4 May 2021 22:12:16 +0200 Subject: [PATCH] require AbstractVector from subset selectors (#2744) --- NEWS.md | 25 ++- Project.toml | 2 +- src/abstractdataframe/subset.jl | 31 +++- test/grouping.jl | 232 -------------------------- test/runtests.jl | 1 + test/subset.jl | 280 ++++++++++++++++++++++++++++++++ 6 files changed, 331 insertions(+), 240 deletions(-) create mode 100644 test/subset.jl diff --git a/NEWS.md b/NEWS.md index 439a4ae0eb..391d8758f0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,23 @@ -# DataFrames v1.0 Release Notes +# DataFrames.jl v1.1 Release Notes + +## Functionality changes + +* make sure `subset` checks if the passed condition function + returns a vector of values (in the 1.0 release also returning scalar `true`, + `false`, or `missing` was allowed which was unintended and error prone) + ([#2744](https://github.com/JuliaData/DataFrames.jl/pull/2744)) + + +# DataFrames.jl v1.0.2 Patch Release Notes + +## Performance improvements + +* fix of performance issue of `groupby` when using multi-threading + ([#2736](https://github.com/JuliaData/DataFrames.jl/pull/2736)) +* fix of performance issue of `groupby` when using `PooledVector` + ([#2733](https://github.com/JuliaData/DataFrames.jl/pull/2733)) + +# DataFrames.jl v1.0 Release Notes ## Breaking changes @@ -76,7 +95,7 @@ [#2574](https://github.com/JuliaData/DataFrames.jl/pull/2574), [#2664](https://github.com/JuliaData/DataFrames.jl/pull/2664)) -# DataFrames v0.22.7 Release notes +# DataFrames.jl v0.22.7 Release notes * `convert` methods from `AbstractDataFrame`, `DataFrameRow` and `GroupKey` to `Array`, `Matrix`, `Vector` and `Tuple`, as well as from `AbstractDict` to @@ -95,7 +114,7 @@ `Tables.CopiedColumns` ([#2656](https://github.com/JuliaData/DataFrames.jl/pull/2656)) -# DataFrames v0.22 Release Notes +# DataFrames.jl v0.22 Release Notes ## Breaking changes diff --git
a/Project.toml b/Project.toml index 8ade6d8eea..3dfa72611f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "DataFrames" uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -version = "1.0.2" +version = "1.1.0" [deps] Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl index 4f8ba0647a..3b4e3c7f5c 100644 --- a/src/abstractdataframe/subset.jl +++ b/src/abstractdataframe/subset.jl @@ -31,6 +31,20 @@ function _and_missing(x::Any...) "but only true, false, or missing are allowed")) end +# we are guaranteed that ByRow returns a vector +# this workaround is needed for 0-argument ByRow +assert_bool_vec(fun::ByRow) = fun + +function assert_bool_vec(@nospecialize(fun)) + return function(x...) + val = fun(x...) + if !(val isa AbstractVector) + throw(ArgumentError("functions passed to `subset` must return an AbstractVector.")) + end + return val + end +end + # Note that _get_subset_conditions will have a large compilation time # if more than 32 conditions are passed as `args`. function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame}, @@ -39,7 +53,7 @@ function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame}, conditions = Any[if a isa ColumnIndex a => Symbol(:x, i) elseif a isa Pair{<:Any, <:Base.Callable} - first(a) => last(a) => Symbol(:x, i) + first(a) => assert_bool_vec(last(a)) => Symbol(:x, i) else throw(ArgumentError("condition specifier $a is not supported by `subset`")) end for (i, a) in enumerate(args)] @@ -71,13 +85,16 @@ end Return a copy of data frame `df` or parent of `gdf` containing only rows for which all values produced by transformation(s) `args` for a given row are `true`. +All transformations must produce vectors containing `true` or `false` (and +optionally `missing` if `skipmissing=true`). 
Each argument passed in `args` can be either a single column selector or a `source_columns => function` transformation specifier following the rules described for [`select`](@ref). Note that as opposed to [`filter`](@ref) the `subset` function works on whole -columns (or all rows in groups for `GroupedDataFrame`). +columns (and selects rows within groups for `GroupedDataFrame` +rather than whole groups), and the transformations must return a vector. If `skipmissing=false` (the default) `args` are required to produce vectors containing only `Bool` values. If `skipmissing=true`, additionally `missing` is @@ -174,13 +191,16 @@ end Update data frame `df` or the parent of `gdf` in place to contain only rows for which all values produced by transformation(s) `args` for a given row is `true`. +All transformations must produce vectors containing `true` or `false` (and +optionally `missing` if `skipmissing=true`). Each argument passed in `args` can be either a single column selector or a `source_columns => function` transformation specifier following the rules described for [`select`](@ref). Note that as opposed to [`filter!`](@ref) the `subset!` function works on whole -columns (or all rows in groups for `GroupedDataFrame`). +columns (and selects rows within groups for `GroupedDataFrame` rather than whole +groups), and the transformations must return a vector. If `skipmissing=false` (the default) `args` are required to produce vectors containing only `Bool` values. If `skipmissing=true`, additionally `missing` is @@ -191,7 +211,10 @@ If `ungroup=false` the resulting data frame is re-grouped based on the same grouping columns as `gdf` and a `GroupedDataFrame` is returned. If `GroupedDataFrame` is subsetted then it must include all groups present in the -`parent` data frame, like in [`select!`](@ref). +`parent` data frame, like in [`select!`](@ref).
Also note that in general +the parent of the passed `GroupedDataFrame` is mutated in place, which means +that the `GroupedDataFrame` is not safe to use as most likely it will +get corrupted due to row removal in the parent. See also: [`subset`](@ref), [`filter!`](@ref), [`select!`](@ref) diff --git a/test/grouping.jl b/test/grouping.jl index f58cf61709..fda21a9a97 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -3482,238 +3482,6 @@ end @test df == df2 end -@testset "subset and subset!" begin - refdf = DataFrame(x = repeat(Any[true, false], 4), - y = repeat([true, false, missing, missing], 2), - z = repeat([1, 2, 3, 3], 2), - id = 1:8) - - for df in (copy(refdf), @view copy(refdf)[1:end-1, :]) - df2 = copy(df) - @test subset(df, :x) ≅ filter(:x => identity, df) - @test df ≅ df2 - @test subset(df, :x) isa DataFrame - @test subset(df, :x, view=true) ≅ filter(:x => identity, df) - @test subset(df, :x, view=true) isa SubDataFrame - @test_throws ArgumentError subset(df, :y) - @test_throws ArgumentError subset(df, :y, :x) - @test subset(df, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) - @test subset(df, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) - @test subset(df, :y, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) - @test subset(df, :y, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) - @test subset(df, :x, :y, skipmissing=true) ≅ - filter([:x, :y] => (x, y) -> x && y === true, df) - @test subset(df, :y, :x, skipmissing=true) ≅ - filter([:x, :y] => (x, y) -> x && y === true, df) - @test subset(df, :x, :y, skipmissing=true, view=true) ≅ - filter([:x, :y] => (x, y) -> x && y === true, df) - @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ - filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) - @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true, view=true) ≅ - filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) - @test subset(df, :x, :id 
=> ByRow(<(4))) ≅ - filter([:x, :id] => (x, id) -> x && id < 4, df) - @test subset(df, :x, :id => ByRow(<(4)), view=true) ≅ - filter([:x, :id] => (x, id) -> x && id < 4, df) - @test_throws ArgumentError subset(df) - @test isempty(subset(df, :x, :x => ByRow(!))) - @test_throws ArgumentError subset(df, :x => x -> false, :x => x -> missing) - @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> missing) - @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> 2) - end - - for df in (copy(refdf), @view copy(refdf)[1:end-1, :]), - gdf in (groupby_checked(df, :z), groupby_checked(df, :z)[[3, 2, 1]]) - df2 = copy(df) - @test subset(gdf, :x) ≅ filter(:x => identity, df) - @test df ≅ df2 - @test subset(gdf, :x) isa DataFrame - @test subset(gdf, :x, ungroup=false) ≅ - groupby_checked(filter(:x => identity, df), :z) - @test subset(gdf, :x, ungroup=false) isa GroupedDataFrame{DataFrame} - @test subset(gdf, :x, view=true) ≅ filter(:x => identity, df) - @test subset(gdf, :x, view=true) isa SubDataFrame - @test subset(gdf, :x, view=true, ungroup=false) ≅ - groupby_checked(filter(:x => identity, df), :z) - @test subset(gdf, :x, view=true, ungroup=false) isa GroupedDataFrame{<:SubDataFrame} - @test_throws ArgumentError subset(gdf, :y) - @test_throws ArgumentError subset(gdf, :y, :x) - @test subset(gdf, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) - @test subset(gdf, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) - @test subset(gdf, :y, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) - @test subset(gdf, :y, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) - @test subset(gdf, :x, :y, skipmissing=true) ≅ - filter([:x, :y] => (x, y) -> x && y === true, df) - @test subset(gdf, :y, :x, skipmissing=true) ≅ - filter([:x, :y] => (x, y) -> x && y === true, df) - @test subset(gdf, :x, :y, skipmissing=true, view=true) ≅ - filter([:x, :y] => (x, y) -> x && y === true, df) - @test subset(gdf, :x, :y, :id => 
ByRow(<(4)), skipmissing=true) ≅ - filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) - @test subset(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true, view=true) ≅ - filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) - @test subset(gdf, :x, :id => ByRow(<(4))) ≅ - filter([:x, :id] => (x, id) -> x && id < 4, df) - @test subset(gdf, :x, :id => ByRow(<(4)), view=true) ≅ - filter([:x, :id] => (x, id) -> x && id < 4, df) - @test_throws ArgumentError subset(gdf) - @test isempty(subset(gdf, :x, :x => ByRow(!))) - @test_throws ArgumentError subset(gdf, :x => x -> false, :x => x -> missing) - @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> missing) - @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> 2) - end - - df = copy(refdf) - @test subset!(df, :x) === df - @test subset!(df, :x) ≅ df ≅ filter(:x => identity, refdf) - df = copy(refdf) - @test_throws ArgumentError subset!(df, :y) - @test df ≅ refdf - df = copy(refdf) - @test subset!(df, :y, skipmissing=true) === df - @test subset!(df, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) - df = copy(refdf) - @test subset!(df, :x, :y, skipmissing=true) === df - @test subset!(df, :x, :y, skipmissing=true) ≅ df ≅ - filter([:x, :y] => (x, y) -> x && y === true, refdf) - df = copy(refdf) - @test subset!(df, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅ - filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf) - df = copy(refdf) - @test subset!(df, :x, :id => ByRow(<(4))) ≅ df ≅ - filter([:x, :id] => (x, id) -> x && id < 4, refdf) - df = copy(refdf) - @test_throws ArgumentError subset!(df) - df = copy(refdf) - @test isempty(subset!(df, :x, :x => ByRow(!))) - @test isempty(df) - - df = copy(refdf) - @test_throws ArgumentError subset!(df, :x => x -> false, :x => x -> missing) - @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> missing) - @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> 2) 
- - df = copy(refdf) - gdf = groupby_checked(df, :z) - @test subset!(gdf, :x) === df - - df = copy(refdf) - gdf = groupby_checked(df, :z) - gdf2 = subset!(gdf, :x, ungroup=false) - @test gdf2 isa GroupedDataFrame{DataFrame} - @test parent(gdf2) === df - @test gdf2 ≅ groupby_checked(df, :z) ≅ groupby_checked(filter(:x => identity, refdf), :z) - - df = copy(refdf) - gdf = groupby_checked(df, :z) - @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf) - df = copy(refdf) - gdf = groupby_checked(df, :z) - @test_throws ArgumentError subset!(gdf, :y) - @test df ≅ refdf - df = copy(refdf) - gdf = groupby_checked(df, :z) - @test subset!(gdf, :y, skipmissing=true) === df - df = copy(refdf) - gdf = groupby_checked(df, :z) - @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) - df = copy(refdf) - gdf = groupby_checked(df, :z) - @test subset!(gdf, :x, :y, skipmissing=true) === df - df = copy(refdf) - gdf = groupby_checked(df, :z) - @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅ - filter([:x, :y] => (x, y) -> x && y === true, refdf) - df = copy(refdf) - gdf = groupby_checked(df, :z) - @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅ - filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf) - df = copy(refdf) - gdf = groupby_checked(df, :z) - @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅ - filter([:x, :id] => (x, id) -> x && id < 4, refdf) - df = copy(refdf) - gdf = groupby_checked(df, :z) - @test_throws ArgumentError subset!(gdf) - df = copy(refdf) - gdf = groupby_checked(df, :z) - @test isempty(subset!(gdf, :x, :x => ByRow(!))) - @test isempty(df) - df = copy(refdf) - gdf = groupby_checked(df, :z) - @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing) - @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing) - @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2) - - df = copy(refdf) - gdf = groupby_checked(df, :z)[[3, 2, 1]] - 
@test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf) - df = copy(refdf) - gdf = groupby_checked(df, :z)[[3, 2, 1]] - @test_throws ArgumentError subset!(gdf, :y) - @test df ≅ refdf - df = copy(refdf) - gdf = groupby_checked(df, :z)[[3, 2, 1]] - @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) - df = copy(refdf) - gdf = groupby_checked(df, :z)[[3, 2, 1]] - @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅ - filter([:x, :y] => (x, y) -> x && y === true, refdf) - df = copy(refdf) - gdf = groupby_checked(df, :z)[[3, 2, 1]] - @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅ - filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf) - df = copy(refdf) - gdf = groupby_checked(df, :z)[[3, 2, 1]] - @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅ - filter([:x, :id] => (x, id) -> x && id < 4, refdf) - df = copy(refdf) - gdf = groupby_checked(df, :z)[[3, 2, 1]] - @test_throws ArgumentError subset!(gdf) - df = copy(refdf) - gdf = groupby_checked(df, :z)[[3, 2, 1]] - @test isempty(subset!(gdf, :x, :x => ByRow(!))) - @test isempty(df) - - df = copy(refdf) - gdf = groupby_checked(df, :z)[[3, 2, 1]] - @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing) - @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing) - @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2) - - @test_throws ArgumentError subset!(view(refdf, :, :), :x) - @test_throws ArgumentError subset!(groupby_checked(view(refdf, :, :), :z), :x) - - df = DataFrame(g=[2, 2, 1, 1, 1, 1, 3, 3, 3], x = 1:9) - @test subset(df, :x => x -> x .< mean(x)) == DataFrame(g=[2, 2, 1, 1], x = 1:4) - @test subset(groupby_checked(df, :g), :x => x -> x .< mean(x)) == - DataFrame(g=[2, 1, 1, 3], x=[1, 3, 4, 7]) - - @test_throws ArgumentError subset(df, :x => x -> missing) - @test isempty(subset(df, :x => x -> missing, skipmissing=true)) - @test isempty(subset(df, :x => x -> false)) - @test 
subset(df, :x => x -> true) ≅ df - @test_throws ArgumentError subset(df, :x => x -> (a=x,)) - @test_throws ArgumentError subset(df, :x => (x -> (a=x,)) => AsTable) - - @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :x, :y) - @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x, :y) - @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x) - @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :y) - @test_throws ArgumentError subset(DataFrame(x=false, y=1), :x, :y) - @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y) - @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x) - @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y) - - @test_throws ArgumentError subset(DataFrame(x=false, y=1), :x, :y, skipmissing=true) - @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y, skipmissing=true) - @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x, skipmissing=true) - @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y, skipmissing=true) - - @test_throws ArgumentError DataFrames._and() - @test_throws ArgumentError DataFrames._and_missing() -end - @testset "make sure we handle idx correctly when groups are reordered" begin df = DataFrame(g=[2, 2, 1, 1, 1], id = 1:5) @test select(df, :g, :id, :id => ByRow(identity) => :id2) == diff --git a/test/runtests.jl b/test/runtests.jl index a2f9ff1fc2..b5ab3f02b1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -29,6 +29,7 @@ my_tests = ["utils.jl", "conversions.jl", "sort.jl", "grouping.jl", + "subset.jl", "join.jl", "iteration.jl", "duplicates.jl", diff --git a/test/subset.jl b/test/subset.jl new file mode 100644 index 0000000000..dadbf7c119 --- /dev/null +++ b/test/subset.jl @@ -0,0 +1,280 @@ +module TestSubset + +using Test, DataFrames, Statistics + +const ≅ = isequal + +@testset "subset and subset!" 
begin + refdf = DataFrame(x = repeat(Any[true, false], 4), + y = repeat([true, false, missing, missing], 2), + z = repeat([1, 2, 3, 3], 2), + id = 1:8) + + for df in (copy(refdf), @view copy(refdf)[1:end-1, :]) + df2 = copy(df) + @test subset(df, :x) ≅ filter(:x => identity, df) + @test df ≅ df2 + @test subset(df, :x) isa DataFrame + @test subset(df, :x, view=true) ≅ filter(:x => identity, df) + @test subset(df, :x, view=true) isa SubDataFrame + @test_throws ArgumentError subset(df, :y) + @test_throws ArgumentError subset(df, :y, :x) + @test subset(df, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) + @test subset(df, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) + @test subset(df, :y, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) + @test subset(df, :y, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) + @test subset(df, :x, :y, skipmissing=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(df, :y, :x, skipmissing=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(df, :x, :y, skipmissing=true, view=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) + @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true, view=true) ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) + @test subset(df, :x, :id => ByRow(<(4))) ≅ + filter([:x, :id] => (x, id) -> x && id < 4, df) + @test subset(df, :x, :id => ByRow(<(4)), view=true) ≅ + filter([:x, :id] => (x, id) -> x && id < 4, df) + @test_throws ArgumentError subset(df) + @test isempty(subset(df, :x, :x => ByRow(!))) + @test_throws ArgumentError subset(df, :x => x -> false, :x => x -> missing) + @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> 2) + end + + 
for df in (copy(refdf), @view copy(refdf)[1:end-1, :]), + gdf in (groupby(df, :z), groupby(df, :z)[[3, 2, 1]]) + df2 = copy(df) + @test subset(gdf, :x) ≅ filter(:x => identity, df) + @test df ≅ df2 + @test subset(gdf, :x) isa DataFrame + @test subset(gdf, :x, ungroup=false) ≅ + groupby(filter(:x => identity, df), :z) + @test subset(gdf, :x, ungroup=false) isa GroupedDataFrame{DataFrame} + @test subset(gdf, :x, view=true) ≅ filter(:x => identity, df) + @test subset(gdf, :x, view=true) isa SubDataFrame + @test subset(gdf, :x, view=true, ungroup=false) ≅ + groupby(filter(:x => identity, df), :z) + @test subset(gdf, :x, view=true, ungroup=false) isa GroupedDataFrame{<:SubDataFrame} + @test_throws ArgumentError subset(gdf, :y) + @test_throws ArgumentError subset(gdf, :y, :x) + @test subset(gdf, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) + @test subset(gdf, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) + @test subset(gdf, :y, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) + @test subset(gdf, :y, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) + @test subset(gdf, :x, :y, skipmissing=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(gdf, :y, :x, skipmissing=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(gdf, :x, :y, skipmissing=true, view=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) + @test subset(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true, view=true) ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) + @test subset(gdf, :x, :id => ByRow(<(4))) ≅ + filter([:x, :id] => (x, id) -> x && id < 4, df) + @test subset(gdf, :x, :id => ByRow(<(4)), view=true) ≅ + filter([:x, :id] => (x, id) -> x && id < 4, df) + @test_throws ArgumentError subset(gdf) + @test isempty(subset(gdf, :x, 
:x => ByRow(!))) + @test_throws ArgumentError subset(gdf, :x => x -> false, :x => x -> missing) + @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> 2) + end + + df = copy(refdf) + @test subset!(df, :x) === df + @test subset!(df, :x) ≅ df ≅ filter(:x => identity, refdf) + df = copy(refdf) + @test_throws ArgumentError subset!(df, :y) + @test df ≅ refdf + df = copy(refdf) + @test subset!(df, :y, skipmissing=true) === df + @test subset!(df, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) + df = copy(refdf) + @test subset!(df, :x, :y, skipmissing=true) === df + @test subset!(df, :x, :y, skipmissing=true) ≅ df ≅ + filter([:x, :y] => (x, y) -> x && y === true, refdf) + df = copy(refdf) + @test subset!(df, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf) + df = copy(refdf) + @test subset!(df, :x, :id => ByRow(<(4))) ≅ df ≅ + filter([:x, :id] => (x, id) -> x && id < 4, refdf) + df = copy(refdf) + @test_throws ArgumentError subset!(df) + df = copy(refdf) + @test isempty(subset!(df, :x, :x => ByRow(!))) + @test isempty(df) + + df = copy(refdf) + @test_throws ArgumentError subset!(df, :x => x -> false, :x => x -> missing) + @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> 2) + + df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :x) === df + + df = copy(refdf) + gdf = groupby(df, :z) + gdf2 = subset!(gdf, :x, ungroup=false) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test parent(gdf2) === df + @test gdf2 ≅ groupby(df, :z) ≅ groupby(filter(:x => identity, refdf), :z) + + df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf) + df = copy(refdf) + gdf = groupby(df, :z) + @test_throws ArgumentError subset!(gdf, :y) + @test df ≅ refdf + 
df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :y, skipmissing=true) === df + df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) + df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :x, :y, skipmissing=true) === df + df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅ + filter([:x, :y] => (x, y) -> x && y === true, refdf) + df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf) + df = copy(refdf) + gdf = groupby(df, :z) + @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅ + filter([:x, :id] => (x, id) -> x && id < 4, refdf) + df = copy(refdf) + gdf = groupby(df, :z) + @test_throws ArgumentError subset!(gdf) + df = copy(refdf) + gdf = groupby(df, :z) + @test isempty(subset!(gdf, :x, :x => ByRow(!))) + @test isempty(df) + df = copy(refdf) + gdf = groupby(df, :z) + @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing) + @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2) + + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf) + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test_throws ArgumentError subset!(gdf, :y) + @test df ≅ refdf + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅ + filter([:x, :y] => (x, y) -> x && y === true, refdf) + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y 
=== true && id < 4, refdf) + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅ + filter([:x, :id] => (x, id) -> x && id < 4, refdf) + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test_throws ArgumentError subset!(gdf) + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test isempty(subset!(gdf, :x, :x => ByRow(!))) + @test isempty(df) + + df = copy(refdf) + gdf = groupby(df, :z)[[3, 2, 1]] + @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing) + @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2) + + @test_throws ArgumentError subset!(view(refdf, :, :), :x) + @test_throws ArgumentError subset!(groupby(view(refdf, :, :), :z), :x) + + df = DataFrame(g=[2, 2, 1, 1, 1, 1, 3, 3, 3], x = 1:9) + @test subset(df, :x => x -> x .< mean(x)) == DataFrame(g=[2, 2, 1, 1], x = 1:4) + @test subset(groupby(df, :g), :x => x -> x .< mean(x)) == + DataFrame(g=[2, 1, 1, 3], x=[1, 3, 4, 7]) + + @test_throws ArgumentError subset(df, :x => x -> missing) + @test isempty(subset(df, :x => ByRow(x -> missing), skipmissing=true)) + @test_throws ArgumentError isempty(subset(df, :x => x -> missing, skipmissing=true)) + @test isempty(subset(df, :x => ByRow(x -> false))) + @test_throws ArgumentError isempty(subset(df, :x => x -> false)) + @test subset(df, :x => ByRow(x -> true)) ≅ df + @test_throws ArgumentError subset(df, :x => x -> true) ≅ df + @test_throws ArgumentError subset(df, :x => x -> (a=x,)) + @test_throws ArgumentError subset(df, :x => (x -> (a=x,)) => AsTable) + + @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x) + @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :y) + @test_throws ArgumentError 
subset(DataFrame(x=false, y=1), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x) + @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y) + + @test_throws ArgumentError subset(DataFrame(x=false, y=1), :x, :y, skipmissing=true) + @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y, skipmissing=true) + @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x, skipmissing=true) + @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y, skipmissing=true) + + @test_throws ArgumentError DataFrames._and() + @test_throws ArgumentError DataFrames._and_missing() +end + +@testset "subsetting requires passing vector" begin + @test_throws ArgumentError subset(DataFrame(x=[]), :x => x -> 1) + @test_throws AssertionError subset(DataFrame(x=[]), :x => ByRow(x -> 1)) + @test_throws ArgumentError subset(DataFrame(x=1:3), [] => () -> true) + @test subset(DataFrame(x=1:3), [] => ByRow(() -> true)) == DataFrame(x=1:3) + @test_throws ArgumentError subset(DataFrame(x = [0, 1]), :x => ==(0)) + @test_throws ArgumentError subset(DataFrame(x = [0, missing]), :x => ismissing) + @test_throws ArgumentError subset(groupby(DataFrame(id = [0, 0, 1, 1], + x = [-1, 1, 3, 4]), :id), + :x => (x -> sum(x) > 0)) + @test_throws ArgumentError subset(DataFrame(x=1:3), :x => x -> fill(missing, length(x))) + @test isempty(subset(DataFrame(x=1:3), :x => x -> fill(missing, length(x)), + skipmissing=true)) + @test subset(DataFrame(x=1:3), :x => x -> fill(true, length(x))) == DataFrame(x=1:3) + @test subset(DataFrame(x=1:3), :x => x -> trues(length(x))) == DataFrame(x=1:3) + @test isempty(subset(DataFrame(x=1:3), :x => x -> fill(false, length(x)))) + @test isempty(subset(DataFrame(x=1:3), :x => x -> falses(length(x)))) + @test_throws AssertionError subset(DataFrame(), [] => () -> Union{}[]) + @test_throws AssertionError subset(DataFrame(), [] => () -> Union{}[], skipmissing=true) 
+ @test subset(DataFrame(x=1:3:15, y=1:5), [:x, :y] => (x, y) -> iseven.(x) .& iseven.(y)) == + DataFrame(x=[4, 10], y=[2, 4]) + @test subset(DataFrame(x=1:3:15, y=1:5), + AsTable([:x, :y]) => v -> iseven.(v.x) .& iseven.(v.y)) == + DataFrame(x=[4, 10], y=[2, 4]) + + @test subset(DataFrame(x=1:3), :x => x -> Any[true, false, true]) == + DataFrame(x=[1, 3]) + @test subset(DataFrame(x=1:3), :x => x -> view(Any[true, false, true], :)) == + DataFrame(x=[1, 3]) + @test_throws ArgumentError subset(DataFrame(x=1:3), :x => x -> Any[true, false, missing]) + @test subset(DataFrame(x=1:3), :x => x -> Any[true, false, missing], skipmissing=true) == + DataFrame(x=[1]) + @test_throws ArgumentError subset(DataFrame(x=1:3), :x => x -> Any[true, false, 1]) + @test_throws ArgumentError subset(DataFrame(x=1:3), :x => x -> Any[true, false, 1], + skipmissing=true) + @test_throws ArgumentError subset(DataFrame(x=1:3), :x => x -> (true for i in 1:3)) +end + +end