Skip to content

Commit

Permalink
Add faster semi and antijoin (#2641)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Mar 8, 2021
1 parent c8f1281 commit 6d5ae35
Show file tree
Hide file tree
Showing 7 changed files with 509 additions and 135 deletions.
11 changes: 6 additions & 5 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,13 @@

## Other relevant changes

* `innerjoin`, `leftjoin`, `rightjoin`, and `outerjoin` are now much faster and
check if passed data frames are sorted by the `on` columns and take into account
if the shorter data frame that is joined has unique values in `on` columns.
These aspects of input data frames might affect the order of rows produced in the output
* `innerjoin`, `leftjoin`, `rightjoin`, `outerjoin`, `semijoin`, and `antijoin`
are now much faster. They check whether the passed data frames are sorted by the
`on` columns and take into account whether the shorter data frame being joined
has unique values in the `on` columns. These aspects of the input data frames
might affect the order of rows produced in the output
([#2612](https://github.com/JuliaData/DataFrames.jl/pull/2612),
[#2622](https://github.com/JuliaData/DataFrames.jl/pull/2622))
[#2622](https://github.com/JuliaData/DataFrames.jl/pull/2622))

# DataFrames v0.22 Release Notes

Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[compat]
CategoricalArrays = "0.9.3"
Compat = "3.17"
DataAPI = "1.4"
DataAPI = "1.6"
InvertedIndices = "1"
IteratorInterfaceExtensions = "0.1.1, 1"
Missings = "0.4.2"
Expand Down
5 changes: 3 additions & 2 deletions benchmarks/join_performance.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ fullgc() = (GC.gc(true); GC.gc(true); GC.gc(true); GC.gc(true))
@assert ARGS[4] in ["uniq", "dup", "manydup"]
@assert ARGS[5] in ["sort", "rand"]
@assert ARGS[6] in ["1", "2"]
@assert ARGS[7] in ["inner", "left", "right", "outer"]
@assert ARGS[7] in ["inner", "left", "right", "outer", "semi", "anti"]

@info ARGS

Expand Down Expand Up @@ -76,7 +76,8 @@ else
end

const joinfun = Dict("inner" => innerjoin, "left" => leftjoin,
"right" => rightjoin, "outer" => outerjoin)[ARGS[7]]
"right" => rightjoin, "outer" => outerjoin,
"semi" => semijoin, "anti" => antijoin)[ARGS[7]]

if ARGS[6] == "1"
df1 = DataFrame(id1 = col1)
Expand Down
4 changes: 4 additions & 0 deletions benchmarks/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,7 @@ julia runtests.jl 100000 50000000 right
julia runtests.jl 5000000 10000000 right
julia runtests.jl 100000 50000000 outer
julia runtests.jl 5000000 10000000 outer
julia runtests.jl 100000 50000000 semi
julia runtests.jl 5000000 10000000 semi
julia runtests.jl 100000 50000000 anti
julia runtests.jl 5000000 10000000 anti
28 changes: 2 additions & 26 deletions src/join/composer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -339,33 +339,9 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame;
joined, src_indicator =
compose_joined_table(joiner, kind, makeunique, left_rename, right_rename, indicator)
elseif kind == :semi
# hash the right rows
dfr_on_grp = group_rows(joiner.dfr_on)
# iterate over left rows and leave those found in right
left_ixs = Vector{Int}()
sizehint!(left_ixs, nrow(joiner.dfl))
dfr_on_grp_cols = ntuple(i -> dfr_on_grp.df[!, i], ncol(dfr_on_grp.df))
dfl_on_cols = ntuple(i -> joiner.dfl_on[!, i], ncol(joiner.dfl_on))
@inbounds for l_ix in 1:nrow(joiner.dfl_on)
if findrow(dfr_on_grp, joiner.dfl_on, dfr_on_grp_cols, dfl_on_cols, l_ix) != 0
push!(left_ixs, l_ix)
end
end
joined = joiner.dfl[left_ixs, :]
joined = joiner.dfl[find_semi_rows(joiner), :]
elseif kind == :anti
# hash the right rows
dfr_on_grp = group_rows(joiner.dfr_on)
# iterate over left rows and leave those not found in right
leftonly_ixs = Vector{Int}()
sizehint!(leftonly_ixs, nrow(joiner.dfl))
dfr_on_grp_cols = ntuple(i -> dfr_on_grp.df[!, i], ncol(dfr_on_grp.df))
dfl_on_cols = ntuple(i -> joiner.dfl_on[!, i], ncol(joiner.dfl_on))
@inbounds for l_ix in 1:nrow(joiner.dfl_on)
if findrow(dfr_on_grp, joiner.dfl_on, dfr_on_grp_cols, dfl_on_cols, l_ix) == 0
push!(leftonly_ixs, l_ix)
end
end
joined = joiner.dfl[leftonly_ixs, :]
joined = joiner.dfl[.!find_semi_rows(joiner), :]
else
throw(ArgumentError("Unknown kind of join requested: $kind"))
end
Expand Down
Loading

0 comments on commit 6d5ae35

Please sign in to comment.