Add faster semi and antijoin (#2641)

JuliaData · Mar 8, 2021 · 6d5ae35 · 6d5ae35
1 parent c8f1281
commit 6d5ae35
Show file tree

Hide file tree

Showing 7 changed files with 509 additions and 135 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -33,12 +33,13 @@
 
 ## Other relevant changes
 
-* `innerjoin`, `leftjoin`, `rightjoin`, and `outerjoin` are now much faster and
-  check if passed data frames are sorted by the `on` columns and takes into account
-  if shorter data frame that is joined has unique values in `on` columns.
-  These aspects of input data frames might affect the order of rows produced in the output
+* `innerjoin`, `leftjoin`, `rightjoin`, `outerjoin`, `semijoin`, and `antijoin`
+  are now much faster. They check whether the passed data frames are sorted by
+  the `on` columns and take into account whether the shorter of the joined data
+  frames has unique values in the `on` columns. These aspects of the input data
+  frames might affect the order of rows produced in the output
   ([#2612](https://github.com/JuliaData/DataFrames.jl/pull/2612),
-   [#2622][https://github.com/JuliaData/DataFrames.jl/pull/2622])
+  [#2622](https://github.com/JuliaData/DataFrames.jl/pull/2622))
 
 # DataFrames v0.22 Release Notes
 

diff --git a/Project.toml b/Project.toml
@@ -25,7 +25,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 [compat]
 CategoricalArrays = "0.9.3"
 Compat = "3.17"
-DataAPI = "1.4"
+DataAPI = "1.6"
 InvertedIndices = "1"
 IteratorInterfaceExtensions = "0.1.1, 1"
 Missings = "0.4.2"

diff --git a/benchmarks/join_performance.jl b/benchmarks/join_performance.jl
@@ -10,7 +10,7 @@ fullgc() = (GC.gc(true); GC.gc(true); GC.gc(true); GC.gc(true))
 @assert ARGS[4] in ["uniq", "dup", "manydup"]
 @assert ARGS[5] in ["sort", "rand"]
 @assert ARGS[6] in ["1", "2"]
-@assert ARGS[7] in ["inner", "left", "right", "outer"]
+@assert ARGS[7] in ["inner", "left", "right", "outer", "semi", "anti"]
 
 @info ARGS
 
@@ -76,7 +76,8 @@ else
 end
 
 const joinfun = Dict("inner" => innerjoin, "left" => leftjoin,
-                     "right" => rightjoin, "outer" => outerjoin)[ARGS[7]]
+                     "right" => rightjoin, "outer" => outerjoin,
+                     "semi" => semijoin, "anti" => antijoin)[ARGS[7]]
 
 if ARGS[6] == "1"
     df1 = DataFrame(id1 = col1)

diff --git a/benchmarks/run.sh b/benchmarks/run.sh
@@ -6,3 +6,7 @@ julia runtests.jl 100000 50000000 right
 julia runtests.jl 5000000 10000000 right
 julia runtests.jl 100000 50000000 outer
 julia runtests.jl 5000000 10000000 outer
+julia runtests.jl 100000 50000000 semi
+julia runtests.jl 5000000 10000000 semi
+julia runtests.jl 100000 50000000 anti
+julia runtests.jl 5000000 10000000 anti
diff --git a/src/join/composer.jl b/src/join/composer.jl
@@ -339,33 +339,9 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame;
         joined, src_indicator =
             compose_joined_table(joiner, kind, makeunique, left_rename, right_rename, indicator)
     elseif kind == :semi
-        # hash the right rows
-        dfr_on_grp = group_rows(joiner.dfr_on)
-        # iterate over left rows and leave those found in right
-        left_ixs = Vector{Int}()
-        sizehint!(left_ixs, nrow(joiner.dfl))
-        dfr_on_grp_cols = ntuple(i -> dfr_on_grp.df[!, i], ncol(dfr_on_grp.df))
-        dfl_on_cols = ntuple(i -> joiner.dfl_on[!, i], ncol(joiner.dfl_on))
-        @inbounds for l_ix in 1:nrow(joiner.dfl_on)
-            if findrow(dfr_on_grp, joiner.dfl_on, dfr_on_grp_cols, dfl_on_cols, l_ix) != 0
-                push!(left_ixs, l_ix)
-            end
-        end
-        joined = joiner.dfl[left_ixs, :]
+        joined = joiner.dfl[find_semi_rows(joiner), :]
     elseif kind == :anti
-        # hash the right rows
-        dfr_on_grp = group_rows(joiner.dfr_on)
-        # iterate over left rows and leave those not found in right
-        leftonly_ixs = Vector{Int}()
-        sizehint!(leftonly_ixs, nrow(joiner.dfl))
-        dfr_on_grp_cols = ntuple(i -> dfr_on_grp.df[!, i], ncol(dfr_on_grp.df))
-        dfl_on_cols = ntuple(i -> joiner.dfl_on[!, i], ncol(joiner.dfl_on))
-        @inbounds for l_ix in 1:nrow(joiner.dfl_on)
-            if findrow(dfr_on_grp, joiner.dfl_on, dfr_on_grp_cols, dfl_on_cols, l_ix) == 0
-                push!(leftonly_ixs, l_ix)
-            end
-        end
-        joined = joiner.dfl[leftonly_ixs, :]
+        joined = joiner.dfl[.!find_semi_rows(joiner), :]
     else
         throw(ArgumentError("Unknown kind of join requested: $kind"))
     end