Skip to content

Commit

Permalink
implement faster innerjoin (#2612)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Feb 13, 2021
1 parent ecfc733 commit 726b4e4
Show file tree
Hide file tree
Showing 8 changed files with 887 additions and 17 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@

## Other relevant changes

* `innerjoin` is now much faster. It checks whether the passed data frames are sorted
by the `on` columns and takes into account whether the shorter of the joined data frames
has unique values in the `on` columns. These aspects of the input data frames might affect
the order of rows produced in the output
([#2612](https://github.com/JuliaData/DataFrames.jl/pull/2612))

# DataFrames v0.22 Release Notes

Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ DataAPI = "1.4"
InvertedIndices = "1"
IteratorInterfaceExtensions = "0.1.1, 1"
Missings = "0.4.2"
PooledArrays = "0.5, 1.0"
PooledArrays = "1.1"
PrettyTables = "0.11"
Reexport = "0.1, 0.2, 1.0"
SortingAlgorithms = "0.1, 0.2, 0.3"
Expand Down
96 changes: 96 additions & 0 deletions benchmarks/innerjoin_performance.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
using CategoricalArrays
using DataFrames
using PooledArrays
using Random

"""
    fullgc()

Run four full garbage collection passes (`GC.gc(true)`) in a row and return
`nothing`. In this script it is called right before every timed join.
"""
function fullgc()
    for _ in 1:4
        GC.gc(true)
    end
    return nothing
end

# Command line arguments (all six are required):
#   ARGS[1], ARGS[2] - integer sizes, parsed into `llen` and `rlen`
#   ARGS[3]          - key column type: "int", "pool", "cat" or "str"
#   ARGS[4]          - key duplication: "uniq", "dup" or "manydup"
#   ARGS[5]          - row order: "sort" or "rand"
#   ARGS[6]          - number of `on` columns: "1" or "2"
#
# These values are user input, so they are validated with explicit exceptions
# instead of `@assert`: assertions are meant for internal invariants and may
# be disabled at some optimization levels.
length(ARGS) == 6 ||
    throw(ArgumentError("expected 6 arguments, got $(length(ARGS))"))
ARGS[3] in ("int", "pool", "cat", "str") || throw(ArgumentError(
    "ARGS[3] must be one of int, pool, cat, str; got $(repr(ARGS[3]))"))
ARGS[4] in ("uniq", "dup", "manydup") || throw(ArgumentError(
    "ARGS[4] must be one of uniq, dup, manydup; got $(repr(ARGS[4]))"))
ARGS[5] in ("sort", "rand") || throw(ArgumentError(
    "ARGS[5] must be one of sort, rand; got $(repr(ARGS[5]))"))
ARGS[6] in ("1", "2") || throw(ArgumentError(
    "ARGS[6] must be one of 1, 2; got $(repr(ARGS[6]))"))
# The "pool" and "cat" column builders only have "dup" and "manydup" variants,
# so reject the unsupported combination here instead of relying on an
# assertion further down the script.
ARGS[4] == "uniq" && ARGS[3] in ("pool", "cat") && throw(ArgumentError(
    "ARGS[4] == \"uniq\" is not supported when ARGS[3] is $(repr(ARGS[3]))"))

@info ARGS

llen = parse(Int, ARGS[1])
rlen = parse(Int, ARGS[2])
# The warm-up joins slice rows 1:1000 and 1:2000 of the generated tables,
# so both sizes must be large enough for these slices to exist.
llen > 1000 || throw(ArgumentError("ARGS[1] must be greater than 1000, got $llen"))
rlen > 2000 || throw(ArgumentError("ARGS[2] must be greater than 2000, got $rlen"))

# Number of digits of the larger size; passed as `pad` when keys are
# converted to strings so that all string keys have the same width.
pad = maximum(length.(string.((llen, rlen))))

# Helpers for key columns with duplicates: every identifier from
# `1:len ÷ reps` occurs `reps` times in a row. The string variant converts
# the identifiers with `string(i, pad=pad)` before repeating them.
repeatedints(len, reps) = repeat(1:len ÷ reps, inner=reps)
repeatedstrings(len, reps, pad) = repeat(string.(1:len ÷ reps, pad=pad), inner=reps)

# Build `col1` (sized by `llen`) and `col2` (sized by `rlen`).
# ARGS[3] selects the container/element type and ARGS[4] selects how often
# each key is repeated: "uniq" = once, "dup" = 2 times, "manydup" = 20 times.
# "pool" and "cat" have no "uniq" variant; that combination trips the
# `@assert` in their fallback branch.
if ARGS[3] == "int"
    if ARGS[4] == "uniq"
        col1, col2 = collect(1:llen), collect(1:rlen)
    elseif ARGS[4] == "dup"
        col1, col2 = repeatedints(llen, 2), repeatedints(rlen, 2)
    else
        @assert ARGS[4] == "manydup"
        col1, col2 = repeatedints(llen, 20), repeatedints(rlen, 20)
    end
elseif ARGS[3] == "pool"
    if ARGS[4] == "dup"
        col1 = PooledArray(repeatedstrings(llen, 2, pad))
        col2 = PooledArray(repeatedstrings(rlen, 2, pad))
    else
        @assert ARGS[4] == "manydup"
        col1 = PooledArray(repeatedstrings(llen, 20, pad))
        col2 = PooledArray(repeatedstrings(rlen, 20, pad))
    end
elseif ARGS[3] == "cat"
    if ARGS[4] == "dup"
        col1 = categorical(repeatedstrings(llen, 2, pad))
        col2 = categorical(repeatedstrings(rlen, 2, pad))
    else
        @assert ARGS[4] == "manydup"
        col1 = categorical(repeatedstrings(llen, 20, pad))
        col2 = categorical(repeatedstrings(rlen, 20, pad))
    end
else
    @assert ARGS[3] == "str"
    if ARGS[4] == "uniq"
        col1, col2 = string.(1:llen, pad=pad), string.(1:rlen, pad=pad)
    elseif ARGS[4] == "dup"
        col1, col2 = repeatedstrings(llen, 2, pad), repeatedstrings(rlen, 2, pad)
    else
        @assert ARGS[4] == "manydup"
        col1, col2 = repeatedstrings(llen, 20, pad), repeatedstrings(rlen, 20, pad)
    end
end

# Fix the seed of the global RNG so the "rand" ordering is the same on
# every run of the script.
Random.seed!(1234)

# ARGS[5] == "sort" keeps the columns in the order they were generated;
# ARGS[5] == "rand" shuffles both columns in place, `col1` first.
if ARGS[5] != "rand"
    @assert ARGS[5] == "sort"
else
    for col in (col1, col2)
        shuffle!(col)
    end
end

# Time `innerjoin` in both argument orders, on one key column (ARGS[6] == "1")
# or on two identical key columns (ARGS[6] == "2").
#
# In both variants `df1` is built from `col1` and `df2` from `col2`, so the two
# tables have the sizes requested by `llen` and `rlen` respectively.
# Each variant first calls `innerjoin` on small slices (1000 and 2000 rows) as
# a warm-up (presumably so that compilation is not part of the timed calls),
# then runs `fullgc()` before every `@time`d join on the full tables.
if ARGS[6] == "1"
    df1 = DataFrame(id1 = col1)
    df2 = DataFrame(id1 = col2)
    innerjoin(df1[1:1000, :], df2[1:2000, :], on=:id1)
    innerjoin(df2[1:2000, :], df1[1:1000, :], on=:id1)
    fullgc()
    @time innerjoin(df1, df2, on=:id1)
    fullgc()
    @time innerjoin(df2, df1, on=:id1)
else
    @assert ARGS[6] == "2"
    df1 = DataFrame(id1 = col1, id2 = col1)
    # The right table must come from `col2`; building it from `col1` would
    # join `df1` with a copy of itself and make the `1:2000` slice below go
    # out of bounds whenever `col1` has fewer than 2000 rows.
    df2 = DataFrame(id1 = col2, id2 = col2)
    innerjoin(df1[1:1000, :], df2[1:2000, :], on=[:id1, :id2])
    innerjoin(df2[1:2000, :], df1[1:1000, :], on=[:id1, :id2])
    fullgc()
    @time innerjoin(df1, df2, on=[:id1, :id2])
    fullgc()
    @time innerjoin(df2, df1, on=[:id1, :id2])
end
2 changes: 2 additions & 0 deletions benchmarks/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Run the benchmark driver (runtests.jl) once per pair of sizes; the two
# numbers of each pair are forwarded as its two command line arguments.
# runtests.jl is referenced by a relative path, so start this script from the
# directory that contains it.
for sizes in "100000 50000000" "5000000 10000000"; do
    # $sizes is intentionally unquoted so that it splits into two arguments.
    julia runtests.jl $sizes
done
12 changes: 12 additions & 0 deletions benchmarks/runtests.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Driver: runs innerjoin_performance.jl (located next to this file) in a fresh
# `julia` process for every supported combination of options.
# The two command line arguments are forwarded unchanged as the first two
# arguments of the benchmark script.
@assert length(ARGS) == 2
llen, rlen = ARGS
file_loc = joinpath(dirname(@__FILE__), "innerjoin_performance.jl")

for a3 in ("str", "int", "pool", "cat")
    for a4 in ("uniq", "dup", "manydup")
        # The "uniq" variant is not run for "pool" and "cat" columns.
        if a4 == "uniq" && (a3 == "pool" || a3 == "cat")
            continue
        end
        for a5 in ("sort", "rand"), a6 in ("1", "2")
            run(`julia $file_loc $llen $rlen $a3 $a4 $a5 $a6`)
        end
    end
end
Loading

0 comments on commit 726b4e4

Please sign in to comment.