-
Notifications
You must be signed in to change notification settings - Fork 55
/
Copy pathdata.table.timings.jl
71 lines (60 loc) · 3.17 KB
/
data.table.timings.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Julia benchmarks from R's data.table
# https://github.com/Rdatatable/data.table/wiki/Benchmarks-:-Grouping
using Profile
using Random
using Statistics
using DataFrames, DataFramesMeta
using CategoricalArrays
# Problem size: every table has N rows. The "large group" key columns draw from
# K distinct labels and the "small group" key columns from N ÷ K distinct labels.
N = 100_000
K = 100
# Seed the global RNG so the randomly generated tables below are reproducible.
Random.seed!(1)
# Plain `Vector` version: key columns are Symbols or Ints, value columns are numeric.
# The order of the `rand` calls is kept fixed so the seeded data does not change.
DA = let big = Symbol.("id", 1:K), small = Symbol.("id", 1:(N ÷ K))
    DataFrame(
        id1 = rand(big, N),          # large groups (Symbol labels id1..idK)
        id2 = rand(big, N),          # large groups (Symbol labels id1..idK)
        id3 = rand(small, N),        # small groups (Symbol labels id1..id(N÷K))
        id4 = rand(1:K, N),          # large groups (Int)
        id5 = rand(1:K, N),          # large groups (Int)
        id6 = rand(1:(N ÷ K), N),    # small groups (Int)
        v1 = rand(1:5, N),           # Int in range [1, 5]
        v2 = rand(1:5, N),           # Int in range [1, 5]
        v3 = rand(N),                # Float64 drawn by `rand(N)`
    )
end;
# CategoricalArray version: the six key columns are wrapped in `CategoricalArray`;
# the value columns stay plain vectors.
# NOTE(review): recent CategoricalArrays releases restrict the supported element
# types -- confirm that Symbol labels are still accepted by the installed version.
DCA = let
    labels(n) = [Symbol("id", i) for i in 1:n]
    DataFrame(
        id1 = CategoricalArray(rand(labels(K), N)),        # large groups (Symbol)
        id2 = CategoricalArray(rand(labels(K), N)),        # large groups (Symbol)
        id3 = CategoricalArray(rand(labels(N ÷ K), N)),    # small groups (Symbol)
        id4 = CategoricalArray(rand(1:K, N)),              # large groups (Int)
        id5 = CategoricalArray(rand(1:K, N)),              # large groups (Int)
        id6 = CategoricalArray(rand(1:(N ÷ K), N)),        # small groups (Int)
        v1 = rand(1:5, N),                                 # Int in range [1, 5]
        v2 = rand(1:5, N),                                 # Int in range [1, 5]
        v3 = rand(N),                                      # Float64 drawn by `rand(N)`
    )
end;
# Array{Union{T, Missing}} version: every column is converted to an array whose
# element type also admits `missing`.
DMA = let
    # Re-wrap `col` as an `Array` with element type `Union{eltype(col), Missing}`.
    withmissing(col) = Array{Union{eltype(col), Missing}}(col)
    labels(n) = [Symbol("id", i) for i in 1:n]
    DataFrame(
        id1 = withmissing(rand(labels(K), N)),        # large groups (Symbol)
        id2 = withmissing(rand(labels(K), N)),        # large groups (Symbol)
        id3 = withmissing(rand(labels(N ÷ K), N)),    # small groups (Symbol)
        id4 = withmissing(rand(1:K, N)),              # large groups (Int)
        id5 = withmissing(rand(1:K, N)),              # large groups (Int)
        id6 = withmissing(rand(1:(N ÷ K), N)),        # small groups (Int)
        v1 = withmissing(rand(1:5, N)),               # Int in range [1, 5]
        v2 = withmissing(rand(1:5, N)),               # Int in range [1, 5]
        v3 = withmissing(rand(N)),                    # Float64 drawn by `rand(N)`
    )
end;
# Apply `f` to every column of `df` other than `key`, within each `key` group.
# Stand-in for `aggregate(df, key, f)`, which was deprecated and then removed in
# DataFrames 1.0; the result columns are named `<col>_<f>` as before.
_aggregate_by(df, key, f) = combine(groupby(df, key), names(df, Not(key)) .=> f)

"""
    dt_timings(D)

Run the data.table grouping benchmark queries against the table `D` and print the
`@time` report of each one. Returns `nothing`.

Every query is timed twice in a row: the first measurement includes compilation,
the second one reflects the steady-state cost.

`D` must have the column layout of the tables built in this file, because the
last two queries select columns by position (`4`/`6` are `id4`/`id6` and `7:9`
are `v1`, `v2`, `v3`). The `mean` calls rely on the `Statistics` standard library
being loaded by the file.
"""
function dt_timings(D)
    # Sum of v1 by a single large-group key.
    @time @by(D, :id1, :sv = sum(:v1))
    @time @by(D, :id1, :sv = sum(:v1))

    # Sum of v1 by two large-group keys.
    @time @by(D, [:id1, :id2], :sv = sum(:v1))
    @time @by(D, [:id1, :id2], :sv = sum(:v1))

    # Sum of v1 and mean of v3 by a small-group key.
    @time @by(D, :id3, :sv = sum(:v1), :mv3 = mean(:v3))
    @time @by(D, :id3, :sv = sum(:v1), :mv3 = mean(:v3))

    # Mean of v1, v2 and v3 by the large-group integer key.
    @time _aggregate_by(D[!, [4; 7:9]], :id4, mean)
    @time _aggregate_by(D[!, [4; 7:9]], :id4, mean)

    # Sum of v1, v2 and v3 by the small-group integer key.
    @time _aggregate_by(D[!, [6; 7:9]], :id6, sum)
    @time _aggregate_by(D[!, [6; 7:9]], :id6, sum)

    return nothing
end
# Run the benchmark suite once for each column representation.
dt_timings(DA)
dt_timings(DCA)
dt_timings(DMA)

# Profile the single-key grouped sum on the plain-array table. `@profile` and
# `Profile.clear` come from the `Profile` standard library loaded at the top of
# the file. The sample buffer is cleared first so it only holds this run;
# inspect the collected samples afterwards with `Profile.print()`.
Profile.clear()
@profile @by(DA, :id1, :sv = sum(:v1));