Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use refpool optimized method for integer grouping #2610

Merged
merged 7 commits into from
Jan 31, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 87 additions & 16 deletions src/dataframerow/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,68 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int,
isequal(cols1[1][r1], cols2[1][r2]) &&
isequal_row(Base.tail(cols1), r1, Base.tail(cols2), r2)

# Simple vector type for internal use which represents a virtual DataAPI.refpool
# so that a vector of reals can be its own DataAPI.refarray:
# this just allows telling the generic row_group_slots method for generic refpools
# what is the minimum index and the number of (potential) groups
# missing values are represented using index max+1
struct IntegerRefpool{T<:Union{Int, Missing}} <: AbstractVector{T}
min::Int
max::Int
function IntegerRefpool{T}(min::Real, max::Real) where T<:Union{Int, Missing}
@assert max < typemax(Int) - 1
@assert typemin(Int) <= widen(max) - widen(min) + 1 < typemax(Int)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Small suggestions:

  1. check that min is less of equal than max
  2. then no need to check typemin(Int)
  3. also +2 is needed as we potentially have Missing to handle
Suggested change
@assert max < typemax(Int) - 1
@assert typemin(Int) <= widen(max) - widen(min) + 1 < typemax(Int)
@assert min <= max < typemax(Int) - 1
@assert widen(max) - widen(min) + 2 < typemax(Int)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to refactor this part but I've tried to reuse your suggestion to simplify checks. Let me know what you think.

new{T}(min, max)
end
end

Base.size(x::IntegerRefpool{T}) where {T} = (x.max - x.min + 1 + (T >: Missing),)
Base.axes(x::IntegerRefpool{T}) where {T} = (x.min:(x.max + (T >: Missing)),)
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
Base.IndexStyle(::Type{<:IntegerRefpool}) = Base.IndexLinear()
@inline function Base.getindex(x::IntegerRefpool{T}, i::Real) where T
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
@boundscheck checkbounds(x, i)
if T >: Missing && i == x.max + 1
return missing
else
return Int(i - x.min + 1)
end
end
Base.allunique(::IntegerRefpool) = true
Base.issorted(::IntegerRefpool) = true

function refpool_and_array(x::AbstractArray)
refpool = DataAPI.refpool(x)
refarray = DataAPI.refarray(x)

if refpool !== nothing
return refpool, refarray
elseif x isa AbstractArray{<:Union{Real, Missing}} &&
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
all(v -> ismissing(v) | isinteger(v), x)
isempty(skipmissing(x)) && return nothing, nothing
minval, maxval = extrema(skipmissing(x))
# Threshold chosen with the same rationale as the row_group_slots refpool method:
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
# refpool approach is faster but we should not allocate too much memory either
# We also have to avoid overflow
if typemin(Int) <= maxval + 1 < typemax(Int) &&
typemin(Int) <= minval <= typemax(Int) &&
widen(maxval) - widen(minval) + 1 <= 2 * length(x) < typemax(Int)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as above you have to use +2 to allow to store missing

Suggested change
widen(maxval) - widen(minval) + 1 <= 2 * length(x) < typemax(Int)
widen(maxval) - widen(minval) + 2 <= 2 * length(x) < typemax(Int)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed. I've changed the code to store this in an ngroups variable to make things clearer.

if eltype(x) >: Missing
refpool′ = IntegerRefpool{Union{Int, Missing}}(minval, maxval)
# Missing values go to last group with code maxval+1
refarray′ = Missings.replace(x, maxval+1)
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
else
refpool′ = IntegerRefpool{Int}(minval, maxval)
refarray′ = x
end
return refpool′, refarray′
else
return nothing, nothing
end
else
return nothing, nothing
end
end

# Helper function for RowGroupDict.
# Returns a tuple:
# 1) the highest group index in the `groups` vector
Expand All @@ -103,16 +165,21 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int,
# 4) whether groups are already sorted
# Optional `groups` vector is set to the group indices of each row (starting at 1)
# With skipmissing=true, rows with missing values are attributed index 0.
row_group_slots(cols::Tuple{Vararg{AbstractVector}},
hash::Val = Val(true),
groups::Union{Vector{Int}, Nothing} = nothing,
skipmissing::Bool = false,
sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} =
row_group_slots(cols, DataAPI.refpool.(cols), hash, groups, skipmissing, sort)
function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
hash::Val = Val(true),
groups::Union{Vector{Int}, Nothing} = nothing,
skipmissing::Bool = false,
sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
rpa = refpool_and_array.(cols)
refpools = first.(rpa)
refarrays = last.(rpa)
row_group_slots(cols, refpools, refarrays, hash, groups, skipmissing, sort)
end

# Generic fallback method based on open adressing hash table
function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
refpools::Any,
refpools::Any, # Ignored
refarrays::Any, # Ignored
hash::Val = Val(true),
groups::Union{Vector{Int}, Nothing} = nothing,
skipmissing::Bool = false,
Expand Down Expand Up @@ -163,8 +230,12 @@ function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
end

# Optimized method for arrays for which DataAPI.refpool is defined and returns an AbstractVector
function row_group_slots(cols::NTuple{N, <:AbstractVector},
refpools::NTuple{N, <:AbstractVector},
function row_group_slots(cols::NTuple{N, AbstractVector},
refpools::NTuple{N, AbstractVector},
refarrays::NTuple{N,
bkamins marked this conversation as resolved.
Show resolved Hide resolved
Union{AbstractVector{<:Real},
Missings.EachReplaceMissing{
<:AbstractVector{<:Union{Real, Missing}}}}},
hash::Val{false},
groups::Union{Vector{Int}, Nothing} = nothing,
skipmissing::Bool = false,
Expand All @@ -173,7 +244,6 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector},
# and this method needs to allocate a groups vector anyway
@assert groups !== nothing && all(col -> length(col) == length(groups), cols)

refs = map(DataAPI.refarray, cols)
missinginds = map(refpools) do refpool
eltype(refpool) >: Missing ?
something(findfirst(ismissing, refpool), lastindex(refpool)+1) : lastindex(refpool)+1
Expand Down Expand Up @@ -205,12 +275,13 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector},
anydups
# In the simplest case, we can work directly with the reference codes
newcols = (skipmissing && any(refpool -> eltype(refpool) >: Missing, refpools)) ||
!(refarrays isa NTuple{<:Any, AbstractVector}) ||
sort ||
anydups ? cols : refs
anydups ? cols : refarrays
return invoke(row_group_slots,
Tuple{Tuple{Vararg{AbstractVector}}, Any, Val,
Tuple{Tuple{Vararg{AbstractVector}}, Any, Any, Val,
Union{Vector{Int}, Nothing}, Bool, Bool},
newcols, refpools, hash, groups, skipmissing, sort)
newcols, refpools, refarrays, hash, groups, skipmissing, sort)
end

seen = fill(false, ngroups)
Expand Down Expand Up @@ -253,7 +324,7 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector},
@inbounds for i in eachindex(groups)
local refs_i
let i=i # Workaround for julia#15276
refs_i = map(c -> c[i], refs)
refs_i = map(c -> c[i], refarrays)
end
vals = map((m, r, s, fi) -> m[r-fi+1] * s, refmaps, refs_i, strides, firstinds)
j = sum(vals) + 1
Expand All @@ -269,7 +340,7 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector},
@inbounds for i in eachindex(groups)
local refs_i
let i=i # Workaround for julia#15276
refs_i = map(refs, missinginds) do ref, missingind
refs_i = map(refarrays, missinginds) do ref, missingind
r = Int(ref[i])
if skipmissing
return r == missingind ? -1 : (r > missingind ? r-1 : r)
Expand Down Expand Up @@ -322,7 +393,7 @@ function compute_indices(groups::AbstractVector{<:Integer}, ngroups::Integer)

# group start positions in a sorted table
starts = Vector{Int}(undef, ngroups+1)
if length(starts) > 1
if length(starts) > 0
starts[1] = 1
@inbounds for i in 1:ngroups
starts[i+1] = starts[i] + stops[i]
Expand Down
9 changes: 4 additions & 5 deletions test/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,14 @@ const ≅ = isequal
d3 = randn(N)
d4 = randn(N)
df7 = DataFrame([d1, d2, d3], [:d1, :d2, :d3])
ref_d1 = unique(d1)

#test_group("groupby")
gd = groupby(df7, :d1)
@test length(gd) == 2
@test gd[1][:, :d2] ≅ d2[d1 .== ref_d1[1]]
@test gd[2][:, :d2] ≅ d2[d1 .== ref_d1[2]]
@test gd[1][:, :d3] == d3[d1 .== ref_d1[1]]
@test gd[2][:, :d3] == d3[d1 .== ref_d1[2]]
@test gd[1][:, :d2] ≅ d2[d1 .== 1]
@test gd[2][:, :d2] ≅ d2[d1 .== 2]
@test gd[1][:, :d3] == d3[d1 .== 1]
@test gd[2][:, :d3] == d3[d1 .== 2]

g1 = groupby(df7, [:d1, :d2])
g2 = groupby(df7, [:d2, :d1])
Expand Down
Loading