Update subset to handle large number of selectors better (#2989)

JuliaData · Jan 23, 2022 · c9a41b0 · c9a41b0 · bkamins · Jan 23, 2022
1 parent 5233159
commit c9a41b0
Show file tree

Hide file tree

Showing 2 changed files with 100 additions and 8 deletions.
diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl
@@ -12,8 +12,23 @@ function _and(x::Any...)
                             "but only true or false are allowed; pass " *
                             "skipmissing=true to skip missing values"))
     else
-        throw(ArgumentError("value $xv was returned in condition number $loc " *
-                            "but only true or false are allowed"))
+        throw(ArgumentError("value $xv was returned in condition number $loc" *
+                            " but only true or false are allowed"))
+    end
+end
+
+_and_long(x::Bool, y::Bool) = x && y
+
+function _and_long(x, y)
+    v = x isa Bool ? y : x
+    @assert !(v isa Bool)
+    if ismissing(v)
+        throw(ArgumentError("missing was returned " *
+                            "but only true or false are allowed; pass " *
+                            "skipmissing=true to skip missing values"))
+    else
+        throw(ArgumentError("value $v was returned" *
+                            " but only true or false are allowed"))
     end
 end
 
@@ -27,7 +42,18 @@ function _and_missing(x::Any...)
     # we know x has positive length and must contain non-boolean
     @assert !isnothing(loc)
     xv = x[loc]
-    throw(ArgumentError("value $xv was returned in condition number $loc" *
+    throw(ArgumentError("value $xv was returned in condition number $loc " *
+                        "but only true, false, or missing are allowed"))
+end
+
+_and_long_missing(x::Bool, y::Bool) = x && y
+_and_long_missing(x::Bool, y::Missing) = false
+_and_long_missing(x::Missing, y::Union{Bool, Missing}) = false
+
+function _and_long_missing(x, y)
+    v = x isa Union{Missing, Bool} ? y : x
+    @assert !(v isa Union{Missing, Bool})
+    throw(ArgumentError("value $v was returned " *
                         "but only true, false, or missing are allowed"))
 end
 
@@ -45,8 +71,6 @@ function assert_bool_vec(@nospecialize(fun))
     end
 end
 
-# Note that _get_subset_conditions will have a large compilation time
-# if more than 32 conditions are passed as `args`.
 function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
                                 (args,)::Ref{Any}, skipmissing::Bool)
     cs_vec = []
@@ -79,10 +103,26 @@ function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
 
     @assert ncol(df_conditions) == length(conditions)
 
-    if skipmissing
-        cond = _and_missing.(eachcol(df_conditions)...)
+    cols = eachcol(df_conditions)
+    # with many columns, process each column sequentially to avoid large compilation time
+    if length(conditions) > 16
+        if skipmissing
+            cond = _and_long_missing.(cols[1], cols[2])
+            for i in 3:length(conditions)
+                cond .= _and_long_missing.(cond, cols[i])
+            end
+        else
+            cond = _and_long.(cols[1], cols[2])
+            for i in 3:length(conditions)
+                cond .= _and_long.(cond, cols[i])
+            end
+        end
     else
-        cond = _and.(eachcol(df_conditions)...)
+        if skipmissing
+            cond = _and_missing.(cols...)
+        else
+            cond = _and.(cols...)
+        end
     end
 
     @assert eltype(cond) === Bool

diff --git a/test/subset.jl b/test/subset.jl
@@ -307,4 +307,56 @@ end
     end
 end
 
+@testset "wide subsetting" begin
+    # current threshold is at 16
+    for i in 1:20
+        df = DataFrame(a=1:10)
+        @test subset(df, [:a => ByRow(x -> true) for _ in 1:i]) == df
+        @test subset(df, [:a => ByRow(x -> true) for _ in 1:i], :a => ByRow(x -> false)) ==
+              DataFrame(a=Int[])
+        @test_throws ArgumentError subset(df, [:a => ByRow(x -> true) for _ in 1:i],
+                                          :a => ByRow(x -> "a"))
+        @test subset(df, [:a => ByRow(x -> true) for _ in 1:i], skipmissing=true) == df
+        @test subset(df, [:a => ByRow(x -> true) for _ in 1:i],
+                     :a => ByRow(x -> false), skipmissing=true) ==
+              DataFrame(a=Int[])
+        @test subset(df, [:a => ByRow(x -> true) for _ in 1:i],
+                     :a => ByRow(x -> missing), skipmissing=true) ==
+              DataFrame(a=Int[])
+        @test subset(df, :a => ByRow(x -> missing), [:a => ByRow(x -> true) for _ in 1:i],
+                     skipmissing=true) ==
+              DataFrame(a=Int[])
+        @test_throws ArgumentError subset(df, :a => ByRow(x -> missing),
+                                          [:a => ByRow(x -> true) for _ in 1:i],
+                                          :a => ByRow(x -> "a"), skipmissing=true)
+        @test_throws ArgumentError subset(df, [:a => ByRow(x -> true) for _ in 1:i],
+                                          :a => ByRow(x -> missing), :a => ByRow(x -> "a"),
+                                          skipmissing=true)
+
+        # randomized correctness tests across two selection options
+        Random.seed!(1234)
+        mat1 = rand(Bool, 10_000, 5)
+        df = DataFrame(mat1, string.("y", 1:5))
+        df.id = 1:nrow(df)
+        df2 = [DataFrame(trues(10_000, i), :auto) df]
+        @test subset(df, Not(:id)).id ==
+              subset(df2, Not(:id)).id ==
+              df.id[all.(eachrow(mat1))]
+
+        mat2 = rand([true, false, missing], 10_000, 5)
+        df = DataFrame(mat2, string.("y", 1:5))
+        df.id = 1:nrow(df)
+        df2 = [DataFrame(trues(10_000, i), :auto) df]
+        @test subset(df, Not(:id), skipmissing=true).id ==
+              subset(df2, Not(:id), skipmissing=true).id ==
+              df.id[all.(isequal(true), eachrow(mat2))]
+        # with so many rows we must see missing somewhere
+        # so this condition is true in practice
+        if any(ismissing, mat2)
+            @test_throws ArgumentError subset(df, Not(:id))
+            @test_throws ArgumentError subset(df2, Not(:id))
+        end
+    end
+end
+
 end