Skip to content

Commit

Permalink
Update subset to handle large number of selectors better (#2989)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Jan 23, 2022
1 parent 5233159 commit c9a41b0
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 8 deletions.
56 changes: 48 additions & 8 deletions src/abstractdataframe/subset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,23 @@ function _and(x::Any...)
"but only true or false are allowed; pass " *
"skipmissing=true to skip missing values"))
else
throw(ArgumentError("value $xv was returned in condition number $loc " *
"but only true or false are allowed"))
throw(ArgumentError("value $xv was returned in condition number $loc" *
" but only true or false are allowed"))
end
end

_and_long(x::Bool, y::Bool) = x && y

function _and_long(x, y)
v = x isa Bool ? y : x
@assert !(v isa Bool)
if ismissing(v)
throw(ArgumentError("missing was returned " *
"but only true or false are allowed; pass " *
"skipmissing=true to skip missing values"))
else
throw(ArgumentError("value $v was returned" *
" but only true or false are allowed"))
end
end

Expand All @@ -27,7 +42,18 @@ function _and_missing(x::Any...)
# we know x has positive length and must contain non-boolean
@assert !isnothing(loc)
xv = x[loc]
throw(ArgumentError("value $xv was returned in condition number $loc" *
throw(ArgumentError("value $xv was returned in condition number $loc " *
"but only true, false, or missing are allowed"))
end

_and_long_missing(x::Bool, y::Bool) = x && y
_and_long_missing(x::Bool, y::Missing) = false
_and_long_missing(x::Missing, y::Union{Bool, Missing}) = false

function _and_long_missing(x, y)
v = x isa Union{Missing, Bool} ? y : x
@assert !(v isa Union{Missing, Bool})
throw(ArgumentError("value $v was returned " *
"but only true, false, or missing are allowed"))
end

Expand All @@ -45,8 +71,6 @@ function assert_bool_vec(@nospecialize(fun))
end
end

# Note that _get_subset_conditions will have a large compilation time
# if more than 32 conditions are passed as `args`.
function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},
(args,)::Ref{Any}, skipmissing::Bool)
cs_vec = []
Expand Down Expand Up @@ -79,10 +103,26 @@ function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame},

@assert ncol(df_conditions) == length(conditions)

if skipmissing
cond = _and_missing.(eachcol(df_conditions)...)
cols = eachcol(df_conditions)
# with many columns, process each column sequentially to avoid large compilation time
if length(conditions) > 16
if skipmissing
cond = _and_long_missing.(cols[1], cols[2])
for i in 3:length(conditions)
cond .= _and_long_missing.(cond, cols[i])
end
else
cond = _and_long.(cols[1], cols[2])
for i in 3:length(conditions)
cond .= _and_long.(cond, cols[i])
end
end
else
cond = _and.(eachcol(df_conditions)...)
if skipmissing
cond = _and_missing.(cols...)
else
cond = _and.(cols...)
end
end

@assert eltype(cond) === Bool
Expand Down
52 changes: 52 additions & 0 deletions test/subset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -307,4 +307,56 @@ end
end
end

@testset "wide subsetting" begin
# current threshold is at 16
for i in 1:20
df = DataFrame(a=1:10)
@test subset(df, [:a => ByRow(x -> true) for _ in 1:i]) == df
@test subset(df, [:a => ByRow(x -> true) for _ in 1:i], :a => ByRow(x -> false)) ==
DataFrame(a=Int[])
@test_throws ArgumentError subset(df, [:a => ByRow(x -> true) for _ in 1:i],
:a => ByRow(x -> "a"))
@test subset(df, [:a => ByRow(x -> true) for _ in 1:i], skipmissing=true) == df
@test subset(df, [:a => ByRow(x -> true) for _ in 1:i],
:a => ByRow(x -> false), skipmissing=true) ==
DataFrame(a=Int[])
@test subset(df, [:a => ByRow(x -> true) for _ in 1:i],
:a => ByRow(x -> missing), skipmissing=true) ==
DataFrame(a=Int[])
@test subset(df, :a => ByRow(x -> missing), [:a => ByRow(x -> true) for _ in 1:i],
skipmissing=true) ==
DataFrame(a=Int[])
@test_throws ArgumentError subset(df, :a => ByRow(x -> missing),
[:a => ByRow(x -> true) for _ in 1:i],
:a => ByRow(x -> "a"), skipmissing=true)
@test_throws ArgumentError subset(df, [:a => ByRow(x -> true) for _ in 1:i],
:a => ByRow(x -> missing), :a => ByRow(x -> "a"),
skipmissing=true)

# randomized correctness tests across two selection options
Random.seed!(1234)
mat1 = rand(Bool, 10_000, 5)
df = DataFrame(mat1, string.("y", 1:5))
df.id = 1:nrow(df)
df2 = [DataFrame(trues(10_000, i), :auto) df]
@test subset(df, Not(:id)).id ==
subset(df2, Not(:id)).id ==
df.id[all.(eachrow(mat1))]

mat2 = rand([true, false, missing], 10_000, 5)
df = DataFrame(mat2, string.("y", 1:5))
df.id = 1:nrow(df)
df2 = [DataFrame(trues(10_000, i), :auto) df]
@test subset(df, Not(:id), skipmissing=true).id ==
subset(df2, Not(:id), skipmissing=true).id ==
df.id[all.(isequal(true), eachrow(mat2))]
# with so many rows we must see missing somewhere
# so this condition is true in practice
if any(ismissing, mat2)
@test_throws ArgumentError subset(df, Not(:id))
@test_throws ArgumentError subset(df2, Not(:id))
end
end
end

end

2 comments on commit c9a41b0

@bkamins
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/53044

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v1.3.2 -m "<description of version>" c9a41b062ce92380bfe0595f26367a5a26505ec5
git push origin v1.3.2

Please sign in to comment.