From 4342310af12ed12a3c1a48bf6128e546ef41ec31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 14 Dec 2020 13:44:44 +0100
Subject: [PATCH 01/16] add subset and subset!

---
 NEWS.md                         |   2 +
 docs/src/lib/functions.md       |   2 +
 docs/src/man/comparisons.md     |   4 +-
 src/DataFrames.jl               |   3 +
 src/abstractdataframe/subset.jl | 231 ++++++++++++++++++++++++++++++++
 test/grouping.jl                | 162 ++++++++++++++++++++++
 6 files changed, 402 insertions(+), 2 deletions(-)
 create mode 100644 src/abstractdataframe/subset.jl

diff --git a/NEWS.md b/NEWS.md
index e9e9295f88..449f3e244f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -8,6 +8,8 @@
 
 ## New functionalities
 
+* add `subset` and `subset!` functions that allow subsetting rows
+  ([#2496](https://github.com/JuliaData/DataFrames.jl/pull/2496))
 
 ## Deprecated
 
diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
index 128f0ac9e7..d6d327c94c 100644
--- a/docs/src/lib/functions.md
+++ b/docs/src/lib/functions.md
@@ -102,6 +102,8 @@ first
 last
 only
 nonunique
+subset
+subset!
 unique
 unique!
 ```
diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md
index 64ba636af1..f8bbf6f470 100644
--- a/docs/src/man/comparisons.md
+++ b/docs/src/man/comparisons.md
@@ -204,7 +204,7 @@ df <- tibble(grp = rep(1:2, 3), x = 6:1, y = 4:9,
 | Rename columns           | `rename(df, x_new = x)`        | `rename(df, :x => :x_new)`             |
 | Pick columns             | `select(df, x, y)`             | `select(df, :x, :y)`                   |
 | Pick & transform columns | `transmute(df, mean(x), y)`    | `select(df, :x => mean, :y)`           |
-| Pick rows                | `filter(df, x >= 1)`           | `filter(:x => >=(1), df)`              |
+| Pick rows                | `filter(df, x >= 1)`           | `subset(df, :x => ByRow(>=(1)))`       |
 | Sort rows                | `arrange(df, x)`               | `sort(df, :x)`                         |
 
 As in dplyr, some of these functions can be applied to grouped data frames, in which case they operate by group:
@@ -240,7 +240,7 @@ The following table compares the main functions of DataFrames.jl with Stata:
 | Add new columns        | `egen x_mean = mean(x)` | `transform!(df, :x => mean => :x_mean)` |
 | Rename columns         | `rename x x_new`        | `rename!(df, :x => :x_new)`             |
 | Pick columns           | `keep x y`              | `select!(df, :x, :y)`                   |
-| Pick rows              | `keep if x >= 1`        | `filter!(:x => >=(1), df)`              |
+| Pick rows              | `keep if x >= 1`        | `subset!(df, :x => ByRow(>=(1)))`       |
 | Sort rows              | `sort x`                | `sort!(df, :x)`                         |
 
 Note that the suffix `!` (i.e. `transform!`, `select!`, etc) ensures that the operation transforms the dataframe in place, as in Stata
diff --git a/src/DataFrames.jl b/src/DataFrames.jl
index 0715c40a3e..7417259461 100644
--- a/src/DataFrames.jl
+++ b/src/DataFrames.jl
@@ -64,6 +64,8 @@ export AbstractDataFrame,
        select,
        semijoin,
        stack,
+       subset,
+       subset!,
        transform,
        transform!,
        unique!,
@@ -104,6 +106,7 @@ include("dataframerow/utils.jl")
 include("other/broadcasting.jl")
 
 include("abstractdataframe/selection.jl")
+include("abstractdataframe/subset.jl")
 include("abstractdataframe/iteration.jl")
 include("abstractdataframe/join.jl")
 include("abstractdataframe/reshape.jl")
diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
new file mode 100644
index 0000000000..7114cd19a9
--- /dev/null
+++ b/src/abstractdataframe/subset.jl
@@ -0,0 +1,231 @@
+# subset allows a transformation specification without a target column name or a column
+
+_process_subset_pair(i::Int, a::ColumnIndex) = a => Symbol(:x, i)
+_process_subset_pair(i::Int, @nospecialize(a::Pair{<:Any, <:Base.Callable})) =
+    first(a) => last(a) => Symbol(:x, i)
+_process_subset_pair(i::Int, a) =
+    throw(ArgumentError("condition specifier $a is not supported by `subset`"))
+
+function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
+                               @nospecialize(args), skipmissing::Bool)
+    conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)]
+
+    isempty(conditions) && throw(ArgumentError("at least one condition must be passed"))
+
+    if df isa AbstractDataFrame
+        df_conditions = select(df, conditions..., copycols=!(df isa DataFrame))
+    else
+        df_conditions = select(df, conditions...,
+                               copycols=!(parent(df) isa DataFrame), keepkeys=false)
+    end
+    test_type = skipmissing ? Union{Missing, Bool} : Bool
+    for col in eachcol(df_conditions)
+        if !(eltype(col) <: test_type)
+            throw(ArgumentError("each transformation must produce a vector whose " *
+                                "eltype is subtype of $test_type"))
+        end
+    end
+    @assert ncol(df_conditions) == length(conditions)
+    if ncol(df_conditions) == 1
+        return .===(df_conditions[!, 1], true)
+    else
+        return .===(.&(eachcol(df_conditions)...), true)
+    end
+end
+
+"""
+    subset(df::AbstractDataFrame, args...; skipmissing::Bool=false, view::Bool=false)
+    subset(gdf::GroupedDataFrame, args...; skipmissing::Bool=false, view::Bool=false)
+
+Return a copy of data frame `df` or parent of `gdf` containing only rows for
+which all values produced by transformation(s) `args` for a given row is `true`.
+
+If `skipmissing=true` returing `missing` in `args` is allowed and these rows are
+dropped. If `skipmissing=false` (the default) an error is thrown if `missing` is
+present.
+
+Each argument passed in `args` can be either a single column selector or a
+`source_columns => function` transformation specifier following the rules
+described for [`select`](@ref).
+
+Note that as opposed to [`filter`](@ref) the `subset` function works on whole
+columns (or all rows in groups for `GroupedDataFrame`) and by default drops rows
+for which contition is false.
+
+If `view=true` a `SubDataFrame` view  is returned instead of a `DataFrame`.
+
+If `GroupedDataFrame` is subsetted then it must include all groups present in the
+`parent` data frame, like in [`select!`](@ref).
+
+See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref)
+
+# Examples
+
+```
+julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false],
+                      z=[true, true, missing, missing], v=[1, 2, 11, 12])
+4×5 DataFrame
+│ Row │ id    │ x    │ y    │ z       │ v     │
+│     │ Int64 │ Bool │ Bool │ Bool?   │ Int64 │
+├─────┼───────┼──────┼──────┼─────────┼───────┤
+│ 1   │ 1     │ 1    │ 1    │ 1       │ 1     │
+│ 2   │ 2     │ 0    │ 1    │ 1       │ 2     │
+│ 3   │ 3     │ 1    │ 0    │ missing │ 11    │
+│ 4   │ 4     │ 0    │ 0    │ missing │ 12    │
+
+julia> subset(df, :x)
+2×5 DataFrame
+│ Row │ id    │ x    │ y    │ z       │ v     │
+│     │ Int64 │ Bool │ Bool │ Bool?   │ Int64 │
+├─────┼───────┼──────┼──────┼─────────┼───────┤
+│ 1   │ 1     │ 1    │ 1    │ 1       │ 1     │
+│ 2   │ 3     │ 1    │ 0    │ missing │ 11    │
+
+julia> subset(df, :v => x -> x .> 3)
+2×5 DataFrame
+│ Row │ id    │ x    │ y    │ z       │ v     │
+│     │ Int64 │ Bool │ Bool │ Bool?   │ Int64 │
+├─────┼───────┼──────┼──────┼─────────┼───────┤
+│ 1   │ 3     │ 1    │ 0    │ missing │ 11    │
+│ 2   │ 4     │ 0    │ 0    │ missing │ 12    │
+
+julia> subset(df, :x, :y => ByRow(!))
+1×5 DataFrame
+│ Row │ id    │ x    │ y    │ z       │ v     │
+│     │ Int64 │ Bool │ Bool │ Bool?   │ Int64 │
+├─────┼───────┼──────┼──────┼─────────┼───────┤
+│ 1   │ 3     │ 1    │ 0    │ missing │ 11    │
+
+julia> subset(df, :x, :z, skipmissing=true)
+1×5 DataFrame
+│ Row │ id    │ x    │ y    │ z     │ v     │
+│     │ Int64 │ Bool │ Bool │ Bool? │ Int64 │
+├─────┼───────┼──────┼──────┼───────┼───────┤
+│ 1   │ 1     │ 1    │ 1    │ 1     │ 1     │
+
+julia> subset(df, :x, :z)
+ERROR: ArgumentError: each transformation must return a vector whose eltype is subtype of Bool
+
+julia> subset(groupby(df, :y), :v => x -> x .> minimum(x))
+2×5 DataFrame
+│ Row │ id    │ x    │ y    │ z       │ v     │
+│     │ Int64 │ Bool │ Bool │ Bool?   │ Int64 │
+├─────┼───────┼──────┼──────┼─────────┼───────┤
+│ 1   │ 2     │ 0    │ 1    │ 1       │ 2     │
+│ 2   │ 4     │ 0    │ 0    │ missing │ 12    │
+```
+"""
+@inline function subset(df::AbstractDataFrame, @nospecialize(args...);
+                        skipmissing::Bool=false, view::Bool=false)
+    row_selector = _get_subset_conditions(df, args, skipmissing)
+    return view ? Base.view(df, row_selector, :) : df[row_selector, :]
+end
+
+@inline function subset(gdf::GroupedDataFrame, @nospecialize(args...);
+                        skipmissing::Bool=false, view::Bool=false)
+    row_selector = _get_subset_conditions(gdf, args, skipmissing)
+    df = parent(gdf)
+    return view ? Base.view(df, row_selector, :) : df[row_selector, :]
+end
+
+"""
+    subset!(df::AbstractDataFrame, args...; skipmissing::Bool=false)
+    subset!(gdf::GroupedDataFrame{DataFrame}, args..., skipmissing::Bool=false)
+
+Update data frame `df` or the parent of `gdf` in place to contain only rows for
+which all values produced by transformation(s) `args` for a given row is `true`.
+
+If `skipmissing=true` returing `missing` in `args` is allowed and these rows are
+dropped. If `skipmissing=false` (the default) an error is thrown if `missing` is
+present.
+
+Each argument passed in `args` can be either a single column selector or a
+`source_columns => function` transformation specifier following the rules
+described for [`select`](@ref).
+
+Note that as opposed to [`filter!`](@ref) the `subset!` function works on whole
+columns (or all rows in groups for `GroupedDataFrame`) and by default drops rows
+for which contition is false.
+
+If `GroupedDataFrame` is subsetted then it must include all groups present in the
+`parent` data frame, like in [`select!`](@ref).
+
+See also: [`subset`](@ref), [`filter`](@ref), [`filter!`](@ref)
+
+# Examples
+
+```
+julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false])
+4×3 DataFrame
+│ Row │ id    │ x    │ y    │
+│     │ Int64 │ Bool │ Bool │
+├─────┼───────┼──────┼──────┤
+│ 1   │ 1     │ 1    │ 1    │
+│ 2   │ 2     │ 0    │ 1    │
+│ 3   │ 3     │ 1    │ 0    │
+│ 4   │ 4     │ 0    │ 0    │
+
+julia> subset!(copy(df), :x, :y => ByRow(!));
+
+julia> df
+4×3 DataFrame
+│ Row │ id    │ x    │ y    │
+│     │ Int64 │ Bool │ Bool │
+├─────┼───────┼──────┼──────┤
+│ 1   │ 1     │ 1    │ 1    │
+│ 2   │ 2     │ 0    │ 1    │
+│ 3   │ 3     │ 1    │ 0    │
+│ 4   │ 4     │ 0    │ 0    │
+
+julia> df = DataFrame(id=1:4, y=[true, true, false, false], v=[1, 2, 11, 12]);
+
+julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x));
+
+julia> df
+2×3 DataFrame
+│ Row │ id    │ y    │ v     │
+│     │ Int64 │ Bool │ Int64 │
+├─────┼───────┼──────┼───────┤
+│ 1   │ 2     │ 1    │ 2     │
+│ 2   │ 4     │ 0    │ 12    │
+
+julia> df = DataFrame(id=1:4, x=[true, false, true, false],
+                             z=[true, true, missing, missing], v=1:4);
+
+julia> subset!(df, :x, :z)
+ERROR: ArgumentError: each transformation must return a vector whose eltype is subtype of Bool
+
+julia> subset!(df, :x, :z, skipmissing=true);
+
+julia> df
+1×4 DataFrame
+│ Row │ id    │ x    │ z     │ v     │
+│     │ Int64 │ Bool │ Bool? │ Int64 │
+├─────┼───────┼──────┼───────┼───────┤
+│ 1   │ 1     │ 1    │ 1     │ 1     │
+
+julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false],
+                             z=[true, true, missing, missing], v=[1, 2, 11, 12]);
+
+julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x));
+
+julia> df
+2×5 DataFrame
+│ Row │ id    │ x    │ y    │ z       │ v     │
+│     │ Int64 │ Bool │ Bool │ Bool?   │ Int64 │
+├─────┼───────┼──────┼──────┼─────────┼───────┤
+│ 1   │ 2     │ 0    │ 1    │ 1       │ 2     │
+│ 2   │ 4     │ 0    │ 0    │ missing │ 12    │
+```
+"""
+function subset!(df::AbstractDataFrame, @nospecialize(args...); skipmissing::Bool=false)
+    row_selector = _get_subset_conditions(df, args, skipmissing)
+    return delete!(df, findall(!, row_selector))
+end
+
+function subset!(gdf::GroupedDataFrame, @nospecialize(args...);
+                skipmissing::Bool=false)
+    row_selector = _get_subset_conditions(gdf, args, skipmissing)
+    df = parent(gdf)
+    return delete!(df, findall(!, row_selector))
+end
diff --git a/test/grouping.jl b/test/grouping.jl
index a2df4868ae..0e741ff351 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -3272,4 +3272,166 @@ end
     @test df == df2
 end
 
+@testset "subset and subset!" begin
+    refdf = DataFrame(x = repeat([true, false], 4),
+                      y = repeat([true, false, missing, missing], 2),
+                      z = repeat([1, 2, 3, 3], 2),
+                      id = 1:8)
+
+    for df in (copy(refdf), @view copy(refdf)[1:end-1, :])
+        @test subset(df, :x) ≅ filter(:x => identity, df)
+        @test subset(df, :x) isa DataFrame
+        @test subset(df, :x, view=true) ≅ filter(:x => identity, df)
+        @test subset(df, :x, view=true) isa SubDataFrame
+        @test_throws ArgumentError subset(df, :y)
+        @test subset(df, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df)
+        @test subset(df, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df)
+        @test subset(df, :x, :y, skipmissing=true) ≅
+              filter([:x, :y] => (x, y) -> x && y === true, df)
+        @test subset(df, :x, :y, skipmissing=true, view=true) ≅
+              filter([:x, :y] => (x, y) -> x && y === true, df)
+        @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅
+              filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df)
+        @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true, view=true) ≅
+              filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df)
+        @test subset(df, :x, :id => ByRow(<(4))) ≅
+              filter([:x, :id] => (x, id) -> x && id < 4, df)
+        @test subset(df, :x, :id => ByRow(<(4)), view=true) ≅
+              filter([:x, :id] => (x, id) -> x && id < 4, df)
+        @test_throws ArgumentError subset(df)
+        @test isempty(subset(df, :x, :x => ByRow(!)))
+    end
+
+    for df in (copy(refdf), @view copy(refdf)[1:end-1, :]),
+        gdf in (groupby_checked(df, :z), groupby_checked(df, :z)[[3, 2, 1]])
+        @test subset(gdf, :x) ≅ filter(:x => identity, df)
+        @test subset(gdf, :x) isa DataFrame
+        @test subset(gdf, :x, view=true) ≅ filter(:x => identity, df)
+        @test subset(gdf, :x, view=true) isa SubDataFrame
+        @test_throws ArgumentError subset(gdf, :y)
+        @test subset(gdf, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df)
+        @test subset(gdf, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df)
+        @test subset(gdf, :x, :y, skipmissing=true) ≅
+              filter([:x, :y] => (x, y) -> x && y === true, df)
+        @test subset(gdf, :x, :y, skipmissing=true, view=true) ≅
+              filter([:x, :y] => (x, y) -> x && y === true, df)
+        @test subset(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅
+              filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df)
+        @test subset(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true, view=true) ≅
+              filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df)
+        @test subset(gdf, :x, :id => ByRow(<(4))) ≅
+              filter([:x, :id] => (x, id) -> x && id < 4, df)
+        @test subset(gdf, :x, :id => ByRow(<(4)), view=true) ≅
+              filter([:x, :id] => (x, id) -> x && id < 4, df)
+        @test_throws ArgumentError subset(gdf)
+        @test isempty(subset(gdf, :x, :x => ByRow(!)))
+    end
+
+    df = copy(refdf)
+    @test subset!(df, :x) ≅ df ≅ filter(:x => identity, refdf)
+    df = copy(refdf)
+    @test_throws ArgumentError subset!(df, :y)
+    @test df ≅ refdf
+    df = copy(refdf)
+    @test subset!(df, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
+    df = copy(refdf)
+    @test subset!(df, :x, :y, skipmissing=true) ≅ df ≅
+          filter([:x, :y] => (x, y) -> x && y === true, refdf)
+    df = copy(refdf)
+    @test subset!(df, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅
+          filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf)
+    df = copy(refdf)
+    @test subset!(df, :x, :id => ByRow(<(4))) ≅ df ≅
+          filter([:x, :id] => (x, id) -> x && id < 4, refdf)
+    df = copy(refdf)
+    @test_throws ArgumentError subset!(df)
+    df = copy(refdf)
+    @test isempty(subset!(df, :x, :x => ByRow(!)))
+    @test isempty(df)
+
+    df = copy(refdf)
+    gdf = groupby(df, :z)
+    @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf)
+    df = copy(refdf)
+    gdf = groupby(df, :z)
+    @test_throws ArgumentError subset!(gdf, :y)
+    @test df ≅ refdf
+    df = copy(refdf)
+    gdf = groupby(df, :z)
+    @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
+    df = copy(refdf)
+    gdf = groupby(df, :z)
+    @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅
+          filter([:x, :y] => (x, y) -> x && y === true, refdf)
+    df = copy(refdf)
+    gdf = groupby(df, :z)
+    @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅
+          filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf)
+    df = copy(refdf)
+    gdf = groupby(df, :z)
+    @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅
+          filter([:x, :id] => (x, id) -> x && id < 4, refdf)
+    df = copy(refdf)
+    gdf = groupby(df, :z)
+    @test_throws ArgumentError subset!(gdf)
+    df = copy(refdf)
+    gdf = groupby(df, :z)
+    @test isempty(subset!(gdf, :x, :x => ByRow(!)))
+    @test isempty(df)
+
+    df = copy(refdf)
+    gdf = groupby(df, :z)[[3, 2, 1]]
+    @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf)
+    df = copy(refdf)
+    gdf = groupby(df, :z)[[3, 2, 1]]
+    @test_throws ArgumentError subset!(gdf, :y)
+    @test df ≅ refdf
+    df = copy(refdf)
+    gdf = groupby(df, :z)[[3, 2, 1]]
+    @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
+    df = copy(refdf)
+    gdf = groupby(df, :z)[[3, 2, 1]]
+    @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅
+          filter([:x, :y] => (x, y) -> x && y === true, refdf)
+    df = copy(refdf)
+    gdf = groupby(df, :z)[[3, 2, 1]]
+    @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅
+          filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf)
+    df = copy(refdf)
+    gdf = groupby(df, :z)[[3, 2, 1]]
+    @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅
+          filter([:x, :id] => (x, id) -> x && id < 4, refdf)
+    df = copy(refdf)
+    gdf = groupby(df, :z)[[3, 2, 1]]
+    @test_throws ArgumentError subset!(gdf)
+    df = copy(refdf)
+    gdf = groupby(df, :z)[[3, 2, 1]]
+    @test isempty(subset!(gdf, :x, :x => ByRow(!)))
+    @test isempty(df)
+
+    @test_throws ArgumentError subset!(view(refdf, :, :), :x)
+    @test_throws ArgumentError subset!(groupby_checked(view(refdf, :, :), :z), :x)
+
+    df = DataFrame(g=[2, 2, 1, 1, 1, 1, 3, 3, 3], x = 1:9)
+    @test subset(df, :x => x -> x .< mean(x)) == DataFrame(g=[2, 2, 1, 1], x = 1:4)
+    @test subset(groupby_checked(df, :g), :x => x -> x .< mean(x)) ==
+          DataFrame(g=[2, 1, 1, 3], x=[1, 3, 4, 7])
+
+    @test_throws ArgumentError subset(df, :x => x -> missing)
+    @test isempty(subset(df, :x => x -> missing, skipmissing=true))
+    @test isempty(subset(df, :x => x -> false))
+    @test subset(df, :x => x -> true) ≅ df
+    @test_throws ArgumentError subset(df, :x => x -> (a=x,))
+    @test_throws ArgumentError subset(df, :x => x -> (a=x,) => AsTable)
+end
+
+@testset "make sure we handle idx correctly when groups are reordered" begin
+    df = DataFrame(g=[2, 2, 1, 1, 1], id = 1:5)
+    @test select(df, :g, :id, :id => ByRow(identity) => :id2) ==
+          select(groupby_checked(df, :g), :id, :id => ByRow(identity) => :id2) ==
+          select(groupby_checked(df, :g, sort=true), :id, :id => ByRow(identity) => :id2) ==
+          select(groupby_checked(df, :g)[[2,1]], :id, :id => ByRow(identity) => :id2) ==
+          [df DataFrame(id2=df.id)]
+end
+
 end # module

From be7c506410dc0ccad65685945d50b0c8ff0ad6a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 14 Dec 2020 15:52:10 +0100
Subject: [PATCH 02/16] fix typo in test

---
 test/grouping.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/grouping.jl b/test/grouping.jl
index 0e741ff351..4ae2ecb553 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -3422,7 +3422,7 @@ end
     @test isempty(subset(df, :x => x -> false))
     @test subset(df, :x => x -> true) ≅ df
     @test_throws ArgumentError subset(df, :x => x -> (a=x,))
-    @test_throws ArgumentError subset(df, :x => x -> (a=x,) => AsTable)
+    @test_throws ArgumentError subset(df, :x => (x -> (a=x,)) => AsTable)
 end
 
 @testset "make sure we handle idx correctly when groups are reordered" begin

From 6c4cf8c0cb12e4531690bed357482ed801847c34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 20 Dec 2020 16:50:39 +0100
Subject: [PATCH 03/16] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/subset.jl | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
index 7114cd19a9..aad3fa6fd8 100644
--- a/src/abstractdataframe/subset.jl
+++ b/src/abstractdataframe/subset.jl
@@ -7,7 +7,7 @@ _process_subset_pair(i::Int, a) =
     throw(ArgumentError("condition specifier $a is not supported by `subset`"))
 
 function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
-                               @nospecialize(args), skipmissing::Bool)
+                                @nospecialize(args), skipmissing::Bool)
     conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)]
 
     isempty(conditions) && throw(ArgumentError("at least one condition must be passed"))
@@ -38,9 +38,9 @@ end
     subset(gdf::GroupedDataFrame, args...; skipmissing::Bool=false, view::Bool=false)
 
 Return a copy of data frame `df` or parent of `gdf` containing only rows for
-which all values produced by transformation(s) `args` for a given row is `true`.
+which all values produced by transformation(s) `args` for a given row are `true`.
 
-If `skipmissing=true` returing `missing` in `args` is allowed and these rows are
+If `skipmissing=true`, returning `missing` in `args` is allowed and corresponding rows are
 dropped. If `skipmissing=false` (the default) an error is thrown if `missing` is
 present.
 
@@ -50,14 +50,14 @@ described for [`select`](@ref).
 
 Note that as opposed to [`filter`](@ref) the `subset` function works on whole
 columns (or all rows in groups for `GroupedDataFrame`) and by default drops rows
-for which contition is false.
+for which condition is false.
 
 If `view=true` a `SubDataFrame` view  is returned instead of a `DataFrame`.
 
-If `GroupedDataFrame` is subsetted then it must include all groups present in the
+If a `GroupedDataFrame` is passed then it must include all groups present in the
 `parent` data frame, like in [`select!`](@ref).
 
-See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref)
+See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref),  [`select`](@ref)
 
 # Examples
 
@@ -190,7 +190,7 @@ julia> df
 │ 2   │ 4     │ 0    │ 12    │
 
 julia> df = DataFrame(id=1:4, x=[true, false, true, false],
-                             z=[true, true, missing, missing], v=1:4);
+                      z=[true, true, missing, missing], v=1:4);
 
 julia> subset!(df, :x, :z)
 ERROR: ArgumentError: each transformation must return a vector whose eltype is subtype of Bool
@@ -205,7 +205,7 @@ julia> df
 │ 1   │ 1     │ 1    │ 1     │ 1     │
 
 julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false],
-                             z=[true, true, missing, missing], v=[1, 2, 11, 12]);
+                      z=[true, true, missing, missing], v=[1, 2, 11, 12]);
 
 julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x));
 
@@ -223,8 +223,7 @@ function subset!(df::AbstractDataFrame, @nospecialize(args...); skipmissing::Boo
     return delete!(df, findall(!, row_selector))
 end
 
-function subset!(gdf::GroupedDataFrame, @nospecialize(args...);
-                skipmissing::Bool=false)
+function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Bool=false)
     row_selector = _get_subset_conditions(gdf, args, skipmissing)
     df = parent(gdf)
     return delete!(df, findall(!, row_selector))

From d614e5594c4e0061bda4825b2af2840d53b313d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 20 Dec 2020 17:44:20 +0100
Subject: [PATCH 04/16] changes after code review

---
 docs/src/man/comparisons.md     |   7 +-
 src/abstractdataframe/subset.jl | 179 +++++++++++++++++++-------------
 test/grouping.jl                |  39 +++++++
 3 files changed, 151 insertions(+), 74 deletions(-)

diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md
index f8bbf6f470..28c819876d 100644
--- a/docs/src/man/comparisons.md
+++ b/docs/src/man/comparisons.md
@@ -204,9 +204,14 @@ df <- tibble(grp = rep(1:2, 3), x = 6:1, y = 4:9,
 | Rename columns           | `rename(df, x_new = x)`        | `rename(df, :x => :x_new)`             |
 | Pick columns             | `select(df, x, y)`             | `select(df, :x, :y)`                   |
 | Pick & transform columns | `transmute(df, mean(x), y)`    | `select(df, :x => mean, :y)`           |
-| Pick rows                | `filter(df, x >= 1)`           | `subset(df, :x => ByRow(>=(1)))`       |
+| Pick rows                | `filter(df, x >= 1)`           | `subset(df, :x => ByRow(x -> x >= 1))` |
 | Sort rows                | `arrange(df, x)`               | `sort(df, :x)`                         |
 
+Note that in the table above the `x -> x >= 1` predicate can be more compactly
+written as `>=(1)`. The latter form has an additional benefit that it is
+compiled only once per Julia session (as opposed to `x -> x >= 1` which defines
+a new anonymous function every time it is introduced).
+
 As in dplyr, some of these functions can be applied to grouped data frames, in which case they operate by group:
 
 | Operation                | dplyr                                      | DataFrames.jl                               |
diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
index aad3fa6fd8..eddb4f0a9f 100644
--- a/src/abstractdataframe/subset.jl
+++ b/src/abstractdataframe/subset.jl
@@ -6,7 +6,7 @@ _process_subset_pair(i::Int, @nospecialize(a::Pair{<:Any, <:Base.Callable})) =
 _process_subset_pair(i::Int, a) =
     throw(ArgumentError("condition specifier $a is not supported by `subset`"))
 
-function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
+@noinline function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
                                 @nospecialize(args), skipmissing::Bool)
     conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)]
 
@@ -18,14 +18,42 @@ function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
         df_conditions = select(df, conditions...,
                                copycols=!(parent(df) isa DataFrame), keepkeys=false)
     end
-    test_type = skipmissing ? Union{Missing, Bool} : Bool
     for col in eachcol(df_conditions)
-        if !(eltype(col) <: test_type)
+        if !(eltype(col) <: Union{Missing, Bool})
             throw(ArgumentError("each transformation must produce a vector whose " *
-                                "eltype is subtype of $test_type"))
+                                "eltype is subtype of Union{Missing, Bool}"))
         end
     end
+
     @assert ncol(df_conditions) == length(conditions)
+
+    if ncol(df_conditions) == 1
+        cond = df_conditions[!, 1]
+    else
+        cond = .&(eachcol(df_conditions)...)
+    end
+
+    @assert eltype(cond) <: Union{Bool, Missing}
+
+    if skipmissing
+        return coalesce.(cond, false)
+    else
+        # currently the inner condition is only evaluated if cond actually
+        # contains some missing values, but this might change in the future,
+        # so this check is kept
+        if Missing <: eltype(cond)
+            pos = findfirst(ismissing, cond)
+            if pos !== nothing
+                # note that the user might have passed a GroupedDataFrame but we
+                # then refer to the parent data frame of this GroupedDataFrame
+                throw(ArgumentError("skipmissing=false and in row $pos of the" *
+                                    " passed conditions were evaluated to" *
+                                    " missing value"))
+            end
+        end
+        return cond
+    end
+
     if ncol(df_conditions) == 1
         return .===(df_conditions[!, 1], true)
     else
@@ -40,9 +68,9 @@ end
 Return a copy of data frame `df` or parent of `gdf` containing only rows for
 which all values produced by transformation(s) `args` for a given row are `true`.
 
-If `skipmissing=true`, returning `missing` in `args` is allowed and corresponding rows are
-dropped. If `skipmissing=false` (the default) an error is thrown if `missing` is
-present.
+If `skipmissing=true`, producing `missing` in conjunction of results produced by
+`args` is allowed and corresponding rows are dropped. If `skipmissing=false`
+(the default) an error is thrown if `missing` is produced.
 
 Each argument passed in `args` can be either a single column selector or a
 `source_columns => function` transformation specifier following the rules
@@ -65,54 +93,54 @@ See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref),  [`select`](@r
 julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false],
                       z=[true, true, missing, missing], v=[1, 2, 11, 12])
 4×5 DataFrame
-│ Row │ id    │ x    │ y    │ z       │ v     │
-│     │ Int64 │ Bool │ Bool │ Bool?   │ Int64 │
-├─────┼───────┼──────┼──────┼─────────┼───────┤
-│ 1   │ 1     │ 1    │ 1    │ 1       │ 1     │
-│ 2   │ 2     │ 0    │ 1    │ 1       │ 2     │
-│ 3   │ 3     │ 1    │ 0    │ missing │ 11    │
-│ 4   │ 4     │ 0    │ 0    │ missing │ 12    │
+ Row │ id     x      y      z        v
+     │ Int64  Bool   Bool   Bool?    Int64
+─────┼─────────────────────────────────────
+   1 │     1   true   true     true      1
+   2 │     2  false   true     true      2
+   3 │     3   true  false  missing     11
+   4 │     4  false  false  missing     12
 
 julia> subset(df, :x)
 2×5 DataFrame
-│ Row │ id    │ x    │ y    │ z       │ v     │
-│     │ Int64 │ Bool │ Bool │ Bool?   │ Int64 │
-├─────┼───────┼──────┼──────┼─────────┼───────┤
-│ 1   │ 1     │ 1    │ 1    │ 1       │ 1     │
-│ 2   │ 3     │ 1    │ 0    │ missing │ 11    │
+ Row │ id     x     y      z        v
+     │ Int64  Bool  Bool   Bool?    Int64
+─────┼────────────────────────────────────
+   1 │     1  true   true     true      1
+   2 │     3  true  false  missing     11
 
 julia> subset(df, :v => x -> x .> 3)
 2×5 DataFrame
-│ Row │ id    │ x    │ y    │ z       │ v     │
-│     │ Int64 │ Bool │ Bool │ Bool?   │ Int64 │
-├─────┼───────┼──────┼──────┼─────────┼───────┤
-│ 1   │ 3     │ 1    │ 0    │ missing │ 11    │
-│ 2   │ 4     │ 0    │ 0    │ missing │ 12    │
+ Row │ id     x      y      z        v
+     │ Int64  Bool   Bool   Bool?    Int64
+─────┼─────────────────────────────────────
+   1 │     3   true  false  missing     11
+   2 │     4  false  false  missing     12
 
 julia> subset(df, :x, :y => ByRow(!))
 1×5 DataFrame
-│ Row │ id    │ x    │ y    │ z       │ v     │
-│     │ Int64 │ Bool │ Bool │ Bool?   │ Int64 │
-├─────┼───────┼──────┼──────┼─────────┼───────┤
-│ 1   │ 3     │ 1    │ 0    │ missing │ 11    │
+ Row │ id     x     y      z        v
+     │ Int64  Bool  Bool   Bool?    Int64
+─────┼────────────────────────────────────
+   1 │     3  true  false  missing     11
 
 julia> subset(df, :x, :z, skipmissing=true)
 1×5 DataFrame
-│ Row │ id    │ x    │ y    │ z     │ v     │
-│     │ Int64 │ Bool │ Bool │ Bool? │ Int64 │
-├─────┼───────┼──────┼──────┼───────┼───────┤
-│ 1   │ 1     │ 1    │ 1    │ 1     │ 1     │
+ Row │ id     x     y     z      v
+     │ Int64  Bool  Bool  Bool?  Int64
+─────┼─────────────────────────────────
+   1 │     1  true  true   true      1
 
 julia> subset(df, :x, :z)
-ERROR: ArgumentError: each transformation must return a vector whose eltype is subtype of Bool
+ERROR: ArgumentError: skipmissing=false and in row 3 of the passed conditions were evaluated to missing value
 
 julia> subset(groupby(df, :y), :v => x -> x .> minimum(x))
 2×5 DataFrame
-│ Row │ id    │ x    │ y    │ z       │ v     │
-│     │ Int64 │ Bool │ Bool │ Bool?   │ Int64 │
-├─────┼───────┼──────┼──────┼─────────┼───────┤
-│ 1   │ 2     │ 0    │ 1    │ 1       │ 2     │
-│ 2   │ 4     │ 0    │ 0    │ missing │ 12    │
+ Row │ id     x      y      z        v
+     │ Int64  Bool   Bool   Bool?    Int64
+─────┼─────────────────────────────────────
+   1 │     2  false   true     true      2
+   2 │     4  false  false  missing     12
 ```
 """
 @inline function subset(df::AbstractDataFrame, @nospecialize(args...);
@@ -135,9 +163,9 @@ end
 Update data frame `df` or the parent of `gdf` in place to contain only rows for
 which all values produced by transformation(s) `args` for a given row is `true`.
 
-If `skipmissing=true` returing `missing` in `args` is allowed and these rows are
-dropped. If `skipmissing=false` (the default) an error is thrown if `missing` is
-present.
+If `skipmissing=true`, producing `missing` in conjunction of results produced by
+`args` is allowed and corresponding rows are dropped. If `skipmissing=false`
+(the default) an error is thrown if `missing` is produced.
 
 Each argument passed in `args` can be either a single column selector or a
 `source_columns => function` transformation specifier following the rules
@@ -157,25 +185,22 @@ See also: [`subset`](@ref), [`filter`](@ref), [`filter!`](@ref)
 ```
 julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false])
 4×3 DataFrame
-│ Row │ id    │ x    │ y    │
-│     │ Int64 │ Bool │ Bool │
-├─────┼───────┼──────┼──────┤
-│ 1   │ 1     │ 1    │ 1    │
-│ 2   │ 2     │ 0    │ 1    │
-│ 3   │ 3     │ 1    │ 0    │
-│ 4   │ 4     │ 0    │ 0    │
+ Row │ id     x      y
+     │ Int64  Bool   Bool
+─────┼─────────────────────
+   1 │     1   true   true
+   2 │     2  false   true
+   3 │     3   true  false
+   4 │     4  false  false
 
-julia> subset!(copy(df), :x, :y => ByRow(!));
+julia> subset!(df, :x, :y => ByRow(!));
 
 julia> df
-4×3 DataFrame
-│ Row │ id    │ x    │ y    │
-│     │ Int64 │ Bool │ Bool │
-├─────┼───────┼──────┼──────┤
-│ 1   │ 1     │ 1    │ 1    │
-│ 2   │ 2     │ 0    │ 1    │
-│ 3   │ 3     │ 1    │ 0    │
-│ 4   │ 4     │ 0    │ 0    │
+1×3 DataFrame
+ Row │ id     x     y
+     │ Int64  Bool  Bool
+─────┼────────────────────
+   1 │     3  true  false
 
 julia> df = DataFrame(id=1:4, y=[true, true, false, false], v=[1, 2, 11, 12]);
 
@@ -183,26 +208,34 @@ julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x));
 
 julia> df
 2×3 DataFrame
-│ Row │ id    │ y    │ v     │
-│     │ Int64 │ Bool │ Int64 │
-├─────┼───────┼──────┼───────┤
-│ 1   │ 2     │ 1    │ 2     │
-│ 2   │ 4     │ 0    │ 12    │
+ Row │ id     y      v
+     │ Int64  Bool   Int64
+─────┼─────────────────────
+   1 │     2   true      2
+   2 │     4  false     12
 
 julia> df = DataFrame(id=1:4, x=[true, false, true, false],
-                      z=[true, true, missing, missing], v=1:4);
+                      z=[true, true, missing, missing], v=1:4)
+4×4 DataFrame
+ Row │ id     x      z        v
+     │ Int64  Bool   Bool?    Int64
+─────┼──────────────────────────────
+   1 │     1   true     true      1
+   2 │     2  false     true      2
+   3 │     3   true  missing      3
+   4 │     4  false  missing      4
 
 julia> subset!(df, :x, :z)
-ERROR: ArgumentError: each transformation must return a vector whose eltype is subtype of Bool
+ERROR: ArgumentError: skipmissing=false and in row 3 of the passed conditions were evaluated to missing value
 
 julia> subset!(df, :x, :z, skipmissing=true);
 
 julia> df
 1×4 DataFrame
-│ Row │ id    │ x    │ z     │ v     │
-│     │ Int64 │ Bool │ Bool? │ Int64 │
-├─────┼───────┼──────┼───────┼───────┤
-│ 1   │ 1     │ 1    │ 1     │ 1     │
+ Row │ id     x     z      v
+     │ Int64  Bool  Bool?  Int64
+─────┼───────────────────────────
+   1 │     1  true   true      1
 
 julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false],
                       z=[true, true, missing, missing], v=[1, 2, 11, 12]);
@@ -211,11 +244,11 @@ julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x));
 
 julia> df
 2×5 DataFrame
-│ Row │ id    │ x    │ y    │ z       │ v     │
-│     │ Int64 │ Bool │ Bool │ Bool?   │ Int64 │
-├─────┼───────┼──────┼──────┼─────────┼───────┤
-│ 1   │ 2     │ 0    │ 1    │ 1       │ 2     │
-│ 2   │ 4     │ 0    │ 0    │ missing │ 12    │
+ Row │ id     x      y      z        v
+     │ Int64  Bool   Bool   Bool?    Int64
+─────┼─────────────────────────────────────
+   1 │     2  false   true     true      2
+   2 │     4  false  false  missing     12
 ```
 """
 function subset!(df::AbstractDataFrame, @nospecialize(args...); skipmissing::Bool=false)
diff --git a/test/grouping.jl b/test/grouping.jl
index 4ae2ecb553..725ce338ff 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -3300,6 +3300,9 @@ end
               filter([:x, :id] => (x, id) -> x && id < 4, df)
         @test_throws ArgumentError subset(df)
         @test isempty(subset(df, :x, :x => ByRow(!)))
+        @test isempty(subset(df, :x => x -> false, :x => x -> missing))
+        @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> missing)
+        @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> 2)
     end
 
     for df in (copy(refdf), @view copy(refdf)[1:end-1, :]),
@@ -3325,16 +3328,22 @@ end
               filter([:x, :id] => (x, id) -> x && id < 4, df)
         @test_throws ArgumentError subset(gdf)
         @test isempty(subset(gdf, :x, :x => ByRow(!)))
+        @test isempty(subset(gdf, :x => x -> false, :x => x -> missing))
+        @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> missing)
+        @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> 2)
     end
 
     df = copy(refdf)
+    @test subset!(df, :x) === df
     @test subset!(df, :x) ≅ df ≅ filter(:x => identity, refdf)
     df = copy(refdf)
     @test_throws ArgumentError subset!(df, :y)
     @test df ≅ refdf
     df = copy(refdf)
+    @test subset!(df, :y, skipmissing=true) === df
     @test subset!(df, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
     df = copy(refdf)
+    @test subset!(df, :x, :y, skipmissing=true) === df
     @test subset!(df, :x, :y, skipmissing=true) ≅ df ≅
           filter([:x, :y] => (x, y) -> x && y === true, refdf)
     df = copy(refdf)
@@ -3349,6 +3358,15 @@ end
     @test isempty(subset!(df, :x, :x => ByRow(!)))
     @test isempty(df)
 
+    df = copy(refdf)
+    @test isempty(subset!(df, :x => x -> false, :x => x -> missing))
+    df = copy(refdf)
+    @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> missing)
+    @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> 2)
+
+    df = copy(refdf)
+    gdf = groupby(df, :z)
+    @test subset!(gdf, :x) === df
     df = copy(refdf)
     gdf = groupby(df, :z)
     @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf)
@@ -3358,9 +3376,15 @@ end
     @test df ≅ refdf
     df = copy(refdf)
     gdf = groupby(df, :z)
+    @test subset!(gdf, :y, skipmissing=true) === df
+    df = copy(refdf)
+    gdf = groupby(df, :z)
     @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
     df = copy(refdf)
     gdf = groupby(df, :z)
+    @test subset!(gdf, :x, :y, skipmissing=true) === df
+    df = copy(refdf)
+    gdf = groupby(df, :z)
     @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅
           filter([:x, :y] => (x, y) -> x && y === true, refdf)
     df = copy(refdf)
@@ -3378,6 +3402,13 @@ end
     gdf = groupby(df, :z)
     @test isempty(subset!(gdf, :x, :x => ByRow(!)))
     @test isempty(df)
+    df = copy(refdf)
+    gdf = groupby(df, :z)
+    @test isempty(subset!(gdf, :x => x -> false, :x => x -> missing))
+    df = copy(refdf)
+    gdf = groupby(df, :z)
+    @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing)
+    @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2)
 
     df = copy(refdf)
     gdf = groupby(df, :z)[[3, 2, 1]]
@@ -3409,6 +3440,14 @@ end
     @test isempty(subset!(gdf, :x, :x => ByRow(!)))
     @test isempty(df)
 
+    df = copy(refdf)
+    gdf = groupby(df, :z)[[3, 2, 1]]
+    @test isempty(subset!(gdf, :x => x -> false, :x => x -> missing))
+    df = copy(refdf)
+    gdf = groupby(df, :z)[[3, 2, 1]]
+    @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing)
+    @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2)
+
     @test_throws ArgumentError subset!(view(refdf, :, :), :x)
     @test_throws ArgumentError subset!(groupby_checked(view(refdf, :, :), :z), :x)
 

From 1859935781b322e02983439a70b0d35b8c60446a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 20 Dec 2020 22:33:09 +0100
Subject: [PATCH 05/16] remove dead code

---
 src/abstractdataframe/subset.jl | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
index eddb4f0a9f..1d551e91df 100644
--- a/src/abstractdataframe/subset.jl
+++ b/src/abstractdataframe/subset.jl
@@ -53,12 +53,6 @@ _process_subset_pair(i::Int, a) =
         end
         return cond
     end
-
-    if ncol(df_conditions) == 1
-        return .===(df_conditions[!, 1], true)
-    else
-        return .===(.&(eachcol(df_conditions)...), true)
-    end
 end
 
 """

From fe29da164c6786f4f755ed80c200fc302b272e3f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 1 Jan 2021 16:51:01 +0100
Subject: [PATCH 06/16] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 docs/src/man/comparisons.md     |  2 +-
 src/abstractdataframe/subset.jl | 21 ++++++++++++---------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md
index 28c819876d..2730b64f61 100644
--- a/docs/src/man/comparisons.md
+++ b/docs/src/man/comparisons.md
@@ -245,7 +245,7 @@ The following table compares the main functions of DataFrames.jl with Stata:
 | Add new columns        | `egen x_mean = mean(x)` | `transform!(df, :x => mean => :x_mean)` |
 | Rename columns         | `rename x x_new`        | `rename!(df, :x => :x_new)`             |
 | Pick columns           | `keep x y`              | `select!(df, :x, :y)`                   |
-| Pick rows              | `keep if x >= 1`        | `subset!(df, :x => ByRow(>=(1))`        |
+| Pick rows              | `keep if x >= 1`        | `subset!(df, :x => ByRow(x -> x >= 1))` |
 | Sort rows              | `sort x`                | `sort!(df, :x)`                         |
 
 Note that the suffix `!` (i.e. `transform!`, `select!`, etc) ensures that the operation transforms the dataframe in place, as in Stata
diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
index 1d551e91df..3d0e8282d1 100644
--- a/src/abstractdataframe/subset.jl
+++ b/src/abstractdataframe/subset.jl
@@ -38,17 +38,17 @@ _process_subset_pair(i::Int, a) =
     if skipmissing
         return coalesce.(cond, false)
     else
-        # actually currently the inner condition is only evaluated if actually
+        # currently the inner condition is only evaluated if actually
         # we have some missings in cond, but it might change in the future so
         # I leave this check
         if Missing <: eltype(cond)
             pos = findfirst(ismissing, cond)
             if pos !== nothing
                 # note that the user might have passed a GroupedDataFrame but we
-                # then referer to the parent data frame of this GroupedDataFrame
-                throw(ArgumentError("skipmissing=false and in row $pos of the" *
+                # then refer to the parent data frame of this GroupedDataFrame
+                throw(ArgumentError("skipmissing=false but at row $pos the" *
                                     " passed conditions were evaluated to" *
-                                    " missing value"))
+                                    " a missing value"))
             end
         end
         return cond
@@ -62,9 +62,12 @@ end
 Return a copy of data frame `df` or parent of `gdf` containing only rows for
 which all values produced by transformation(s) `args` for a given row are `true`.
 
-If `skipmissing=true`, producing `missing` in conjunction of results produced by
-`args` is allowed and corresponding rows are dropped. If `skipmissing=false`
-(the default) an error is thrown if `missing` is produced.
+If `skipmissing=false` (the default), an error is thrown if the conjunction (i.e. `&`)
+of results produced by `args` is `missing`. If `skipmissing=true`, `missing`
+is allowed and corresponding rows are dropped. Note that if some transformations
+in `args` return `false` and others `missing` for a given row, the conjunction is `false`,
+but that if some return `true` and others `missing`, the conjunction is `missing`
+([three-valued logic](https://en.wikipedia.org/wiki/Three-valued_logic)).
 
 Each argument passed in `args` can be either a single column selector or a
 `source_columns => function` transformation specifier following the rules
@@ -126,7 +129,7 @@ julia> subset(df, :x, :z, skipmissing=true)
    1 │     1  true  true   true      1
 
 julia> subset(df, :x, :z)
-ERROR: ArgumentError: skipmissing=false and in row 3 of the passed conditions were evaluated to missing value
+ERROR: ArgumentError: skipmissing=false but at row 3 the passed conditions were evaluated to a missing value
 
 julia> subset(groupby(df, :y), :v => x -> x .> minimum(x))
 2×5 DataFrame
@@ -220,7 +223,7 @@ julia> df = DataFrame(id=1:4, x=[true, false, true, false],
    4 │     4  false  missing      4
 
 julia> subset!(df, :x, :z)
-ERROR: ArgumentError: skipmissing=false and in row 3 of the passed conditions were evaluated to missing value
+ERROR: ArgumentError: skipmissing=false but at row 3 the passed conditions were evaluated to a missing value
 
 julia> subset!(df, :x, :z, skipmissing=true);
 

From 33966dfd625db8f4bbc184dce1615648ca92da6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 1 Jan 2021 19:10:01 +0100
Subject: [PATCH 07/16] changes after code review

---
 docs/src/man/comparisons.md     | 10 ++---
 src/abstractdataframe/subset.jl | 78 ++++++++++++++-------------------
 test/grouping.jl                | 24 +++++-----
 3 files changed, 52 insertions(+), 60 deletions(-)

diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md
index 2730b64f61..fabfa5ef36 100644
--- a/docs/src/man/comparisons.md
+++ b/docs/src/man/comparisons.md
@@ -12,6 +12,11 @@ df = DataFrame(grp = repeat(1:2, 3), x = 6:-1:1, y = 4:9, z = [3:7; missing], id
 df2 = DataFrame(grp = [1, 3], w = [10, 11])
 ```
 
+Note that in the comparisons presented below predicates like `x -> x >= 1` can
+be more compactly written as `>=(1)`. The latter form has an additional benefit
+that it is compiled only once per Julia session (as opposed to `x -> x >= 1`
+which defines a new anonymous function every time it is introduced).
+
 ## Comparison with the Python package pandas
 
 The following table compares the main functions of DataFrames.jl with the Python package pandas (version 1.1.0):
@@ -207,11 +212,6 @@ df <- tibble(grp = rep(1:2, 3), x = 6:1, y = 4:9,
 | Pick rows                | `filter(df, x >= 1)`           | `subset(df, :x => ByRow(x -> x >= 1))` |
 | Sort rows                | `arrange(df, x)`               | `sort(df, :x)`                         |
 
-Note that in the table above the `x -> x >= 1` predicate can be more compactly
-written as `=>(1)`. The latter form has an additional benefit that it is
-compiled only once per Julia session (as opposed to `x -> x >= 1` which defines
-a new anonymous function every time it is introduced).
-
 As in dplyr, some of these functions can be applied to grouped data frames, in which case they operate by group:
 
 | Operation                | dplyr                                      | DataFrames.jl                               |
diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
index 3d0e8282d1..8703d8ba10 100644
--- a/src/abstractdataframe/subset.jl
+++ b/src/abstractdataframe/subset.jl
@@ -6,6 +6,17 @@ _process_subset_pair(i::Int, @nospecialize(a::Pair{<:Any, <:Base.Callable})) =
 _process_subset_pair(i::Int, a) =
     throw(ArgumentError("condition specifier $a is not supported by `subset`"))
 
+_and(x::Bool) = x
+_and(x::Bool, y::Bool...) = x && _and(y...)
+_and(x::Any...) =
+    throw(ArgumentError("set of non-boolean values $x passed in boolean context"))
+
+_and_missing(x::Bool) = x
+_and_missing(x::Bool, y::Union{Bool, Missing}...) = x && _and_missing(y...)
+_and_missing(x::Missing, y::Union{Bool, Missing}...) = false
+_and_missing(x::Any...) =
+    throw(ArgumentError("set of non-boolean values $x passed in boolean context"))
+
 @noinline function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
                                 @nospecialize(args), skipmissing::Bool)
     conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)]
@@ -18,41 +29,17 @@ _process_subset_pair(i::Int, a) =
         df_conditions = select(df, conditions...,
                                copycols=!(parent(df) isa DataFrame), keepkeys=false)
     end
-    for col in eachcol(df_conditions)
-        if !(eltype(col) <: Union{Missing, Bool})
-            throw(ArgumentError("each transformation must produce a vector whose " *
-                                "eltype is subtype of Union{Missing, Bool}"))
-        end
-    end
 
     @assert ncol(df_conditions) == length(conditions)
 
-    if ncol(df_conditions) == 1
-        cond = df_conditions[!, 1]
-    else
-        cond = .&(eachcol(df_conditions)...)
-    end
-
-    @assert eltype(cond) <: Union{Bool, Missing}
-
     if skipmissing
-        return coalesce.(cond, false)
+        cond = _and_missing.(eachcol(df_conditions)...)
     else
-        # currently the inner condition is only evaluated if actually
-        # we have some missings in cond, but it might change in the future so
-        # I leave this check
-        if Missing <: eltype(cond)
-            pos = findfirst(ismissing, cond)
-            if pos !== nothing
-                # note that the user might have passed a GroupedDataFrame but we
-                # then refer to the parent data frame of this GroupedDataFrame
-                throw(ArgumentError("skipmissing=false but at row $pos the" *
-                                    " passed conditions were evaluated to" *
-                                    " a missing value"))
-            end
-        end
-        return cond
+        cond = _and.(eachcol(df_conditions)...)
     end
+
+    @assert eltype(cond) === Bool
+    return cond
 end
 
 """
@@ -62,26 +49,26 @@ end
 Return a copy of data frame `df` or parent of `gdf` containing only rows for
 which all values produced by transformation(s) `args` for a given row are `true`.
 
-If `skipmissing=false` (the default), an error is thrown if the conjunction (i.e. `&`)
-of results produced by `args` is `missing`. If `skipmissing=true`, `missing`
-is allowed and corresponding rows are dropped. Note that if some transformations
-in `args` return `false` and others `missing` for a given row, the conjunction is `false`,
-but that if some return `true` and others `missing`, the conjunction is `missing`
-([three-valued logic](https://en.wikipedia.org/wiki/Three-valued_logic)).
-
 Each argument passed in `args` can be either a single column selector or a
 `source_columns => function` transformation specifier following the rules
 described for [`select`](@ref).
 
 Note that as opposed to [`filter`](@ref) the `subset` function works on whole
-columns (or all rows in groups for `GroupedDataFrame`) and by default drops rows
+columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows
 for which condition is false.
 
+If `skipmissing=false` (the default) vectors containing only `Bool` values are
+allowed to be produced by `args`. If `skipmissing=true`, additionally `missing`
+is allowed and it is treated as `false` (corresponding rows are skipped).
+
 If `view=true` a `SubDataFrame` view  is returned instead of a `DataFrame`.
 
 If a `GroupedDataFrame` is passed then it must include all groups present in the
 `parent` data frame, like in [`select!`](@ref).
 
+Note that this function will have a large compilation time if more than 32
+conditions are passed as `args`.
+
 See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref),  [`select`](@ref)
 
 # Examples
@@ -129,7 +116,7 @@ julia> subset(df, :x, :z, skipmissing=true)
    1 │     1  true  true   true      1
 
 julia> subset(df, :x, :z)
-ERROR: ArgumentError: skipmissing=false but at row 3 the passed conditions were evaluated to a missing value
+ERROR: ArgumentError: set of non-boolean values (true, missing) passed in boolean context
 
 julia> subset(groupby(df, :y), :v => x -> x .> minimum(x))
 2×5 DataFrame
@@ -160,21 +147,24 @@ end
 Update data frame `df` or the parent of `gdf` in place to contain only rows for
 which all values produced by transformation(s) `args` for a given row is `true`.
 
-If `skipmissing=true`, producing `missing` in conjunction of results produced by
-`args` is allowed and corresponding rows are dropped. If `skipmissing=false`
-(the default) an error is thrown if `missing` is produced.
-
 Each argument passed in `args` can be either a single column selector or a
 `source_columns => function` transformation specifier following the rules
 described for [`select`](@ref).
 
 Note that as opposed to [`filter!`](@ref) the `subset!` function works on whole
-columns (or all rows in groups for `GroupedDataFrame`) and by default drops rows
+columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows
 for which contition is false.
 
+If `skipmissing=false` (the default) vectors containing only `Bool` values are
+allowed to be produced by `args`. If `skipmissing=true`, additionally `missing`
+is allowed and it is treated as `false` (corresponding rows are skipped).
+
 If `GroupedDataFrame` is subsetted then it must include all groups present in the
 `parent` data frame, like in [`select!`](@ref).
 
+Note that this function will have a large compilation time if more than 32
+conditions are passed as `args`.
+
 See also: [`subset`](@ref), [`filter`](@ref), [`filter!`](@ref)
 
 # Examples
@@ -223,7 +213,7 @@ julia> df = DataFrame(id=1:4, x=[true, false, true, false],
    4 │     4  false  missing      4
 
 julia> subset!(df, :x, :z)
-ERROR: ArgumentError: skipmissing=false but at row 3 the passed conditions were evaluated to a missing value
+ERROR: ArgumentError: set of non-boolean values (true, missing) passed in boolean context
 
 julia> subset!(df, :x, :z, skipmissing=true);
 
diff --git a/test/grouping.jl b/test/grouping.jl
index 725ce338ff..e76e353632 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -3273,7 +3273,7 @@ end
 end
 
 @testset "subset and subset!" begin
-    refdf = DataFrame(x = repeat([true, false], 4),
+    refdf = DataFrame(x = repeat(Any[true, false], 4),
                       y = repeat([true, false, missing, missing], 2),
                       z = repeat([1, 2, 3, 3], 2),
                       id = 1:8)
@@ -3300,7 +3300,7 @@ end
               filter([:x, :id] => (x, id) -> x && id < 4, df)
         @test_throws ArgumentError subset(df)
         @test isempty(subset(df, :x, :x => ByRow(!)))
-        @test isempty(subset(df, :x => x -> false, :x => x -> missing))
+        @test_throws ArgumentError subset(df, :x => x -> false, :x => x -> missing)
         @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> missing)
         @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> 2)
     end
@@ -3328,7 +3328,7 @@ end
               filter([:x, :id] => (x, id) -> x && id < 4, df)
         @test_throws ArgumentError subset(gdf)
         @test isempty(subset(gdf, :x, :x => ByRow(!)))
-        @test isempty(subset(gdf, :x => x -> false, :x => x -> missing))
+        @test_throws ArgumentError subset(gdf, :x => x -> false, :x => x -> missing)
         @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> missing)
         @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> 2)
     end
@@ -3359,8 +3359,7 @@ end
     @test isempty(df)
 
     df = copy(refdf)
-    @test isempty(subset!(df, :x => x -> false, :x => x -> missing))
-    df = copy(refdf)
+    @test_throws ArgumentError subset!(df, :x => x -> false, :x => x -> missing)
     @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> missing)
     @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> 2)
 
@@ -3404,9 +3403,7 @@ end
     @test isempty(df)
     df = copy(refdf)
     gdf = groupby(df, :z)
-    @test isempty(subset!(gdf, :x => x -> false, :x => x -> missing))
-    df = copy(refdf)
-    gdf = groupby(df, :z)
+    @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing)
     @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing)
     @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2)
 
@@ -3442,9 +3439,7 @@ end
 
     df = copy(refdf)
     gdf = groupby(df, :z)[[3, 2, 1]]
-    @test isempty(subset!(gdf, :x => x -> false, :x => x -> missing))
-    df = copy(refdf)
-    gdf = groupby(df, :z)[[3, 2, 1]]
+    @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing)
     @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing)
     @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2)
 
@@ -3462,6 +3457,13 @@ end
     @test subset(df, :x => x -> true) ≅ df
     @test_throws ArgumentError subset(df, :x => x -> (a=x,))
     @test_throws ArgumentError subset(df, :x => (x -> (a=x,)) => AsTable)
+
+    @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :x, :y)
+    @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x, :y)
+    @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :y)
+    @test_throws ArgumentError subset(DataFrame(x=false, y=1), :x, :y)
+    @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y)
+    @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y)
 end
 
 @testset "make sure we handle idx correctly when groups are reordered" begin

From 48cc61800f4869f1026472c782ac6886afb9604a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 1 Jan 2021 22:45:25 +0100
Subject: [PATCH 08/16] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/subset.jl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
index 8703d8ba10..27a0bade68 100644
--- a/src/abstractdataframe/subset.jl
+++ b/src/abstractdataframe/subset.jl
@@ -57,9 +57,10 @@ Note that as opposed to [`filter`](@ref) the `subset` function works on whole
 columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows
 for which condition is false.
 
-If `skipmissing=false` (the default) vectors containing only `Bool` values are
-allowed to be produced by `args`. If `skipmissing=true`, additionally `missing`
-is allowed and it is treated as `false` (corresponding rows are skipped).
+If `skipmissing=false` (the default) `args` are required to produce vectors containing
+only `Bool` values.
+If `skipmissing=true`, additionally `missing` is allowed and it is treated as `false`
+(i.e. rows for which one of the conditions returns `missing` are skipped).
 
 If `view=true` a `SubDataFrame` view  is returned instead of a `DataFrame`.
 

From 1c286395946b9f61564b5a624198d7b288c3f12b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 1 Jan 2021 23:09:26 +0100
Subject: [PATCH 09/16] corrections after code review

---
 src/abstractdataframe/subset.jl | 58 ++++++++++++++++++++++-----------
 test/grouping.jl                | 12 +++++++
 2 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
index 27a0bade68..7a035ef91b 100644
--- a/src/abstractdataframe/subset.jl
+++ b/src/abstractdataframe/subset.jl
@@ -6,17 +6,42 @@ _process_subset_pair(i::Int, @nospecialize(a::Pair{<:Any, <:Base.Callable})) =
 _process_subset_pair(i::Int, a) =
     throw(ArgumentError("condition specifier $a is not supported by `subset`"))
 
+_and() = throw(ArgumentError("at least one condition must be passed"))
 _and(x::Bool) = x
 _and(x::Bool, y::Bool...) = x && _and(y...)
-_and(x::Any...) =
-    throw(ArgumentError("set of non-boolean values $x passed in boolean context"))
 
+function _and(x::Any...)
+    loc = findfirst(x -> !(x isa Bool), x)
+    # fallback method: reached only for non-empty x holding some non-Bool value
+    @assert !isnothing(loc)
+    xv = x[loc]
+    if ismissing(xv)
+        throw(ArgumentError("missing was returned in condition number $loc " *
+                            "but only true or false are allowed; pass " *
+                            "skipmissing=true to skip missing values"))
+    else
+        throw(ArgumentError("value $xv was returned in condition number $loc " *
+                            "but only true or false are allowed"))
+    end
+end
+
+_and_missing() = throw(ArgumentError("at least one condition must be passed"))
 _and_missing(x::Bool) = x
 _and_missing(x::Bool, y::Union{Bool, Missing}...) = x && _and_missing(y...)
 _and_missing(x::Missing, y::Union{Bool, Missing}...) = false
-_and_missing(x::Any...) =
-    throw(ArgumentError("set of non-boolean values $x passed in boolean context"))
 
+function _and_missing(x::Any...)
+    loc = findfirst(x -> !(x isa Union{Bool, Missing}), x)
+    # fallback method: x is non-empty and holds a value that is neither Bool nor missing
+    @assert !isnothing(loc)
+    xv = x[loc]
+    throw(ArgumentError("value $xv was returned in condition number $loc " *
+                        "but only true, false, or missing are allowed"))
+end
+
+
+# Note that _get_subset_conditions will have a large compilation time
+# if more than 32 conditions are passed as `args`.
 @noinline function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
                                 @nospecialize(args), skipmissing::Bool)
     conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)]
@@ -57,19 +82,16 @@ Note that as opposed to [`filter`](@ref) the `subset` function works on whole
 columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows
 for which condition is false.
 
-If `skipmissing=false` (the default) `args` are required to produce vectors containing
-only `Bool` values.
-If `skipmissing=true`, additionally `missing` is allowed and it is treated as `false`
-(i.e. rows for which one of the conditions returns `missing` are skipped).
+If `skipmissing=false` (the default) `args` are required to produce vectors
+containing only `Bool` values. If `skipmissing=true`, additionally `missing` is
+allowed and it is treated as `false` (i.e. rows for which one of the conditions
+returns `missing` are skipped).
 
 If `view=true` a `SubDataFrame` view  is returned instead of a `DataFrame`.
 
 If a `GroupedDataFrame` is passed then it must include all groups present in the
 `parent` data frame, like in [`select!`](@ref).
 
-Note that this function will have a large compilation time if more than 32
-conditions are passed as `args`.
-
 See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref),  [`select`](@ref)
 
 # Examples
@@ -117,7 +139,7 @@ julia> subset(df, :x, :z, skipmissing=true)
    1 │     1  true  true   true      1
 
 julia> subset(df, :x, :z)
-ERROR: ArgumentError: set of non-boolean values (true, missing) passed in boolean context
+ERROR: ArgumentError: missing was returned in condition number 2 but only true or false are allowed; pass skipmissing=true to skip missing values
 
 julia> subset(groupby(df, :y), :v => x -> x .> minimum(x))
 2×5 DataFrame
@@ -156,16 +178,14 @@ Note that as opposed to [`filter!`](@ref) the `subset!` function works on whole
 columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows
 for which contition is false.
 
-If `skipmissing=false` (the default) vectors containing only `Bool` values are
-allowed to be produced by `args`. If `skipmissing=true`, additionally `missing`
-is allowed and it is treated as `false` (corresponding rows are skipped).
+If `skipmissing=false` (the default) `args` are required to produce vectors
+containing only `Bool` values. If `skipmissing=true`, additionally `missing` is
+allowed and it is treated as `false` (i.e. rows for which one of the conditions
+returns `missing` are skipped).
 
 If `GroupedDataFrame` is subsetted then it must include all groups present in the
 `parent` data frame, like in [`select!`](@ref).
 
-Note that this function will have a large compilation time if more than 32
-conditions are passed as `args`.
-
 See also: [`subset`](@ref), [`filter`](@ref), [`filter!`](@ref)
 
 # Examples
@@ -214,7 +234,7 @@ julia> df = DataFrame(id=1:4, x=[true, false, true, false],
    4 │     4  false  missing      4
 
 julia> subset!(df, :x, :z)
-ERROR: ArgumentError: set of non-boolean values (true, missing) passed in boolean context
+ERROR: ArgumentError: missing was returned in condition number 2 but only true or false are allowed; pass skipmissing=true to skip missing values
 
 julia> subset!(df, :x, :z, skipmissing=true);
 
diff --git a/test/grouping.jl b/test/grouping.jl
index e76e353632..d0c5af5397 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -3284,10 +3284,15 @@ end
         @test subset(df, :x, view=true) ≅ filter(:x => identity, df)
         @test subset(df, :x, view=true) isa SubDataFrame
         @test_throws ArgumentError subset(df, :y)
+        @test_throws ArgumentError subset(df, :y, :x)
         @test subset(df, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df)
         @test subset(df, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df)
+        @test subset(df, :y, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df)
+        @test subset(df, :y, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df)
         @test subset(df, :x, :y, skipmissing=true) ≅
               filter([:x, :y] => (x, y) -> x && y === true, df)
+        @test subset(df, :y, :x, skipmissing=true) ≅
+              filter([:x, :y] => (x, y) -> x && y === true, df)
         @test subset(df, :x, :y, skipmissing=true, view=true) ≅
               filter([:x, :y] => (x, y) -> x && y === true, df)
         @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅
@@ -3312,10 +3317,15 @@ end
         @test subset(gdf, :x, view=true) ≅ filter(:x => identity, df)
         @test subset(gdf, :x, view=true) isa SubDataFrame
         @test_throws ArgumentError subset(gdf, :y)
+        @test_throws ArgumentError subset(gdf, :y, :x)
         @test subset(gdf, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df)
         @test subset(gdf, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df)
+        @test subset(gdf, :y, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df)
+        @test subset(gdf, :y, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df)
         @test subset(gdf, :x, :y, skipmissing=true) ≅
               filter([:x, :y] => (x, y) -> x && y === true, df)
+        @test subset(gdf, :y, :x, skipmissing=true) ≅
+              filter([:x, :y] => (x, y) -> x && y === true, df)
         @test subset(gdf, :x, :y, skipmissing=true, view=true) ≅
               filter([:x, :y] => (x, y) -> x && y === true, df)
         @test subset(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅
@@ -3460,9 +3470,11 @@ end
 
     @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :x, :y)
     @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x, :y)
+    @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x)
     @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :y)
     @test_throws ArgumentError subset(DataFrame(x=false, y=1), :x, :y)
     @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y)
+    @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x)
     @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y)
 end
 

From 9655dafd7c7eee8eac95128f29c34a9d5d04b967 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 2 Jan 2021 12:28:13 +0100
Subject: [PATCH 10/16] improve test coverage

---
 test/grouping.jl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test/grouping.jl b/test/grouping.jl
index d0c5af5397..e105199794 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -3476,6 +3476,14 @@ end
     @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y)
     @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x)
     @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y)
+
+    @test_throws ArgumentError subset(DataFrame(x=false, y=1), :x, :y, skipmissing=true)
+    @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y, skipmissing=true)
+    @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x, skipmissing=true)
+    @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y, skipmissing=true)
+
+    @test_throws ArgumentError DataFrames._and()
+    @test_throws ArgumentError DataFrames._and_missing()
 end
 
 @testset "make sure we handle idx correctly when groups are reordered" begin

From 058c06e316375e4a61c3d6eb0dba1f82269ade3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 3 Jan 2021 12:50:50 +0100
Subject: [PATCH 11/16] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/subset.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
index 7a035ef91b..d149dfa195 100644
--- a/src/abstractdataframe/subset.jl
+++ b/src/abstractdataframe/subset.jl
@@ -42,7 +42,7 @@ end
 
 # Note that _get_subset_conditions will have a large compilation time
 # if more than 32 conditions are passed as `args`.
-@noinline function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
+function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
                                 @nospecialize(args), skipmissing::Bool)
     conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)]
 
@@ -92,7 +92,7 @@ If `view=true` a `SubDataFrame` view  is returned instead of a `DataFrame`.
 If a `GroupedDataFrame` is passed then it must include all groups present in the
 `parent` data frame, like in [`select!`](@ref).
 
-See also: [`subset!`](@ref), [`filter`](@ref), [`filter!`](@ref),  [`select`](@ref)
+See also: [`subset!`](@ref), [`filter`](@ref), [`select`](@ref)
 
 # Examples
 
@@ -186,7 +186,7 @@ returns `missing` are skipped).
 If `GroupedDataFrame` is subsetted then it must include all groups present in the
 `parent` data frame, like in [`select!`](@ref).
 
-See also: [`subset`](@ref), [`filter`](@ref), [`filter!`](@ref)
+See also: [`subset`](@ref), [`filter!`](@ref), [`select!`](@ref)
 
 # Examples
 

From 2ae30ca6db1938e2d58500495ee8812357f2729a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 3 Jan 2021 13:30:35 +0100
Subject: [PATCH 12/16] add ungroup kwarg and fix docstring

---
 src/abstractdataframe/subset.jl | 30 +++++++++++------
 test/grouping.jl                | 60 +++++++++++++++++++++------------
 2 files changed, 59 insertions(+), 31 deletions(-)

diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
index d149dfa195..15c807b551 100644
--- a/src/abstractdataframe/subset.jl
+++ b/src/abstractdataframe/subset.jl
@@ -69,7 +69,8 @@ end
 
 """
     subset(df::AbstractDataFrame, args...; skipmissing::Bool=false, view::Bool=false)
-    subset(gdf::GroupedDataFrame, args...; skipmissing::Bool=false, view::Bool=false)
+    subset(gdf::GroupedDataFrame, args...; skipmissing::Bool=false, view::Bool=false,
+           ungroup::Bool=true)
 
 Return a copy of data frame `df` or parent of `gdf` containing only rows for
 which all values produced by transformation(s) `args` for a given row are `true`.
@@ -79,8 +80,7 @@ Each argument passed in `args` can be either a single column selector or a
 described for [`select`](@ref).
 
 Note that as opposed to [`filter`](@ref) the `subset` function works on whole
-columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows
-for which condition is false.
+columns (or all rows in groups for `GroupedDataFrame`).
 
 If `skipmissing=false` (the default) `args` are required to produce vectors
 containing only `Bool` values. If `skipmissing=true`, additionally `missing` is
@@ -89,6 +89,9 @@ returns `missing` are skipped).
 
 If `view=true` a `SubDataFrame` view  is returned instead of a `DataFrame`.
 
+If `ungroup=false` the return value of the operation on `gdf` is re-grouped
+based on the same grouping columns and `GroupedDataFrame` is returned.
+
 If a `GroupedDataFrame` is passed then it must include all groups present in the
 `parent` data frame, like in [`select!`](@ref).
 
@@ -157,15 +160,18 @@ julia> subset(groupby(df, :y), :v => x -> x .> minimum(x))
 end
 
 @inline function subset(gdf::GroupedDataFrame, @nospecialize(args...);
-                        skipmissing::Bool=false, view::Bool=false)
+                        skipmissing::Bool=false, view::Bool=false,
+                        ungroup::Bool=true)
     row_selector = _get_subset_conditions(gdf, args, skipmissing)
     df = parent(gdf)
-    return view ? Base.view(df, row_selector, :) : df[row_selector, :]
+    res = view ? Base.view(df, row_selector, :) : df[row_selector, :]
+    return ungroup ? res : groupby(res, groupcols(gdf))
 end
 
 """
     subset!(df::AbstractDataFrame, args...; skipmissing::Bool=false)
-    subset!(gdf::GroupedDataFrame{DataFrame}, args..., skipmissing::Bool=false)
+    subset!(gdf::GroupedDataFrame{DataFrame}, args...; skipmissing::Bool=false,
+            ungroup::Bool=true)
 
 Update data frame `df` or the parent of `gdf` in place to contain only rows for
 which all values produced by transformation(s) `args` for a given row is `true`.
@@ -175,14 +181,16 @@ Each argument passed in `args` can be either a single column selector or a
 described for [`select`](@ref).
 
 Note that as opposed to [`filter!`](@ref) the `subset!` function works on whole
-columns (or all rows in groups for `GroupedDataFrame`) and by default skips rows
-for which contition is false.
+columns (or all rows in groups for `GroupedDataFrame`).
 
 If `skipmissing=false` (the default) `args` are required to produce vectors
 containing only `Bool` values. If `skipmissing=true`, additionally `missing` is
 allowed and it is treated as `false` (i.e. rows for which one of the conditions
 returns `missing` are skipped).
 
+If `ungroup=false` the return value of the operation on `gdf` is re-grouped
+based on the same grouping columns and `GroupedDataFrame` is returned.
+
 If `GroupedDataFrame` is subsetted then it must include all groups present in the
 `parent` data frame, like in [`select!`](@ref).
 
@@ -264,8 +272,10 @@ function subset!(df::AbstractDataFrame, @nospecialize(args...); skipmissing::Boo
     return delete!(df, findall(!, row_selector))
 end
 
-function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Bool=false)
+function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Bool=false,
+                 ungroup::Bool=true)
     row_selector = _get_subset_conditions(gdf, args, skipmissing)
     df = parent(gdf)
-    return delete!(df, findall(!, row_selector))
+    res = delete!(df, findall(!, row_selector))
+    return ungroup ? res : groupby(res, groupcols(gdf))
 end
diff --git a/test/grouping.jl b/test/grouping.jl
index e105199794..cfa3ae2a25 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -3279,7 +3279,9 @@ end
                       id = 1:8)
 
     for df in (copy(refdf), @view copy(refdf)[1:end-1, :])
+        df2 = copy(df)
         @test subset(df, :x) ≅ filter(:x => identity, df)
+        @test df ≅ df2
         @test subset(df, :x) isa DataFrame
         @test subset(df, :x, view=true) ≅ filter(:x => identity, df)
         @test subset(df, :x, view=true) isa SubDataFrame
@@ -3312,10 +3314,18 @@ end
 
     for df in (copy(refdf), @view copy(refdf)[1:end-1, :]),
         gdf in (groupby_checked(df, :z), groupby_checked(df, :z)[[3, 2, 1]])
+        df2 = copy(df)
         @test subset(gdf, :x) ≅ filter(:x => identity, df)
+        @test df ≅ df2
         @test subset(gdf, :x) isa DataFrame
+        @test subset(gdf, :x, ungroup=false) ≅
+              groupby_checked(filter(:x => identity, df), :z)
+        @test subset(gdf, :x, ungroup=false) isa GroupedDataFrame{DataFrame}
         @test subset(gdf, :x, view=true) ≅ filter(:x => identity, df)
         @test subset(gdf, :x, view=true) isa SubDataFrame
+        @test subset(gdf, :x, view=true, ungroup=false) ≅
+              groupby_checked(filter(:x => identity, df), :z)
+        @test subset(gdf, :x, view=true, ungroup=false) isa GroupedDataFrame{<:SubDataFrame}
         @test_throws ArgumentError subset(gdf, :y)
         @test_throws ArgumentError subset(gdf, :y, :x)
         @test subset(gdf, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df)
@@ -3374,81 +3384,89 @@ end
     @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> 2)
 
     df = copy(refdf)
-    gdf = groupby(df, :z)
+    gdf = groupby_checked(df, :z)
     @test subset!(gdf, :x) === df
+
+    df = copy(refdf)
+    gdf = groupby_checked(df, :z)
+    gdf2 = subset!(gdf, :x, ungroup=false)
+    @test gdf2 isa GroupedDataFrame{DataFrame}
+    @test parent(gdf2) === df
+    @test gdf2 ≅ groupby_checked(df, :z) ≅ groupby_checked(filter(:x => identity, refdf), :z)
+
     df = copy(refdf)
-    gdf = groupby(df, :z)
+    gdf = groupby_checked(df, :z)
     @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf)
     df = copy(refdf)
-    gdf = groupby(df, :z)
+    gdf = groupby_checked(df, :z)
     @test_throws ArgumentError subset!(gdf, :y)
     @test df ≅ refdf
     df = copy(refdf)
-    gdf = groupby(df, :z)
+    gdf = groupby_checked(df, :z)
     @test subset!(gdf, :y, skipmissing=true) === df
     df = copy(refdf)
-    gdf = groupby(df, :z)
+    gdf = groupby_checked(df, :z)
     @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
     df = copy(refdf)
-    gdf = groupby(df, :z)
+    gdf = groupby_checked(df, :z)
     @test subset!(gdf, :x, :y, skipmissing=true) === df
     df = copy(refdf)
-    gdf = groupby(df, :z)
+    gdf = groupby_checked(df, :z)
     @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅
           filter([:x, :y] => (x, y) -> x && y === true, refdf)
     df = copy(refdf)
-    gdf = groupby(df, :z)
+    gdf = groupby_checked(df, :z)
     @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅
           filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf)
     df = copy(refdf)
-    gdf = groupby(df, :z)
+    gdf = groupby_checked(df, :z)
     @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅
           filter([:x, :id] => (x, id) -> x && id < 4, refdf)
     df = copy(refdf)
-    gdf = groupby(df, :z)
+    gdf = groupby_checked(df, :z)
     @test_throws ArgumentError subset!(gdf)
     df = copy(refdf)
-    gdf = groupby(df, :z)
+    gdf = groupby_checked(df, :z)
     @test isempty(subset!(gdf, :x, :x => ByRow(!)))
     @test isempty(df)
     df = copy(refdf)
-    gdf = groupby(df, :z)
+    gdf = groupby_checked(df, :z)
     @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing)
     @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing)
     @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2)
 
     df = copy(refdf)
-    gdf = groupby(df, :z)[[3, 2, 1]]
+    gdf = groupby_checked(df, :z)[[3, 2, 1]]
     @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf)
     df = copy(refdf)
-    gdf = groupby(df, :z)[[3, 2, 1]]
+    gdf = groupby_checked(df, :z)[[3, 2, 1]]
     @test_throws ArgumentError subset!(gdf, :y)
     @test df ≅ refdf
     df = copy(refdf)
-    gdf = groupby(df, :z)[[3, 2, 1]]
+    gdf = groupby_checked(df, :z)[[3, 2, 1]]
     @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf)
     df = copy(refdf)
-    gdf = groupby(df, :z)[[3, 2, 1]]
+    gdf = groupby_checked(df, :z)[[3, 2, 1]]
     @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅
           filter([:x, :y] => (x, y) -> x && y === true, refdf)
     df = copy(refdf)
-    gdf = groupby(df, :z)[[3, 2, 1]]
+    gdf = groupby_checked(df, :z)[[3, 2, 1]]
     @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅
           filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf)
     df = copy(refdf)
-    gdf = groupby(df, :z)[[3, 2, 1]]
+    gdf = groupby_checked(df, :z)[[3, 2, 1]]
     @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅
           filter([:x, :id] => (x, id) -> x && id < 4, refdf)
     df = copy(refdf)
-    gdf = groupby(df, :z)[[3, 2, 1]]
+    gdf = groupby_checked(df, :z)[[3, 2, 1]]
     @test_throws ArgumentError subset!(gdf)
     df = copy(refdf)
-    gdf = groupby(df, :z)[[3, 2, 1]]
+    gdf = groupby_checked(df, :z)[[3, 2, 1]]
     @test isempty(subset!(gdf, :x, :x => ByRow(!)))
     @test isempty(df)
 
     df = copy(refdf)
-    gdf = groupby(df, :z)[[3, 2, 1]]
+    gdf = groupby_checked(df, :z)[[3, 2, 1]]
     @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing)
     @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing)
     @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2)

From 2f7a9192a6be9275eb0074da09304cf52afbc767 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 3 Jan 2021 17:31:19 +0100
Subject: [PATCH 13/16] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/subset.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
index 15c807b551..be955d6711 100644
--- a/src/abstractdataframe/subset.jl
+++ b/src/abstractdataframe/subset.jl
@@ -89,8 +89,8 @@ returns `missing` are skipped).
 
 If `view=true` a `SubDataFrame` view  is returned instead of a `DataFrame`.
 
-If `ungroup=false` the return value of the operation on `gdf` is re-grouped
-based on the same grouping columns and `GroupedDataFrame` is returned.
+If `ungroup=false` the resulting data frame is re-grouped based on the same
+grouping columns as `gdf` and a `GroupedDataFrame` is returned.
 
 If a `GroupedDataFrame` is passed then it must include all groups present in the
 `parent` data frame, like in [`select!`](@ref).

From 1e481c6f439b34c8644e61cb0d233b52ebc914a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 3 Jan 2021 17:31:52 +0100
Subject: [PATCH 14/16] fix doc

---
 src/abstractdataframe/subset.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
index be955d6711..ca8ea52a1a 100644
--- a/src/abstractdataframe/subset.jl
+++ b/src/abstractdataframe/subset.jl
@@ -188,8 +188,8 @@ containing only `Bool` values. If `skipmissing=true`, additionally `missing` is
 allowed and it is treated as `false` (i.e. rows for which one of the conditions
 returns `missing` are skipped).
 
-If `ungroup=false` the return value of the operation on `gdf` is re-grouped
-based on the same grouping columns and `GroupedDataFrame` is returned.
+If `ungroup=false` the resulting data frame is re-grouped based on the same
+grouping columns as `gdf` and a `GroupedDataFrame` is returned.
 
 If `GroupedDataFrame` is subsetted then it must include all groups present in the
 `parent` data frame, like in [`select!`](@ref).

From e43afb535a1c1f704841ad32215a3e3435d08694 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 3 Jan 2021 18:42:49 +0100
Subject: [PATCH 15/16] add TODO for faster groupby

---
 src/abstractdataframe/subset.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
index ca8ea52a1a..b06dfeb80a 100644
--- a/src/abstractdataframe/subset.jl
+++ b/src/abstractdataframe/subset.jl
@@ -165,6 +165,7 @@ end
     row_selector = _get_subset_conditions(gdf, args, skipmissing)
     df = parent(gdf)
     res = view ? Base.view(df, row_selector, :) : df[row_selector, :]
+    # TODO: in some cases it might be faster to groupby gdf.groups[row_selector]
     return ungroup ? res : groupby(res, groupcols(gdf))
 end
 
@@ -277,5 +278,6 @@ function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Boo
     row_selector = _get_subset_conditions(gdf, args, skipmissing)
     df = parent(gdf)
     res = delete!(df, findall(!, row_selector))
+    # TODO: in some cases it might be faster to groupby gdf.groups[row_selector]
     return ungroup ? res : groupby(res, groupcols(gdf))
 end

From 6704541c5db9bc68500f3955841c73382705088c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 9 Jan 2021 23:29:07 +0100
Subject: [PATCH 16/16] remove @inline

---
 src/abstractdataframe/subset.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
index b06dfeb80a..6f0cdfca31 100644
--- a/src/abstractdataframe/subset.jl
+++ b/src/abstractdataframe/subset.jl
@@ -153,14 +153,14 @@ julia> subset(groupby(df, :y), :v => x -> x .> minimum(x))
    2 │     4  false  false  missing     12
 ```
 """
-@inline function subset(df::AbstractDataFrame, @nospecialize(args...);
-                        skipmissing::Bool=false, view::Bool=false)
+function subset(df::AbstractDataFrame, @nospecialize(args...);
+                skipmissing::Bool=false, view::Bool=false)
     row_selector = _get_subset_conditions(df, args, skipmissing)
     return view ? Base.view(df, row_selector, :) : df[row_selector, :]
 end
 
-@inline function subset(gdf::GroupedDataFrame, @nospecialize(args...);
-                        skipmissing::Bool=false, view::Bool=false,
+function subset(gdf::GroupedDataFrame, @nospecialize(args...);
+                skipmissing::Bool=false, view::Bool=false,
                         ungroup::Bool=true)
     row_selector = _get_subset_conditions(gdf, args, skipmissing)
     df = parent(gdf)