Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use refpool optimized method for integer grouping #2610

Merged
merged 7 commits into from
Jan 31, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 72 additions & 12 deletions src/dataframerow/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,59 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int,
isequal(cols1[1][r1], cols2[1][r2]) &&
isequal_row(Base.tail(cols1), r1, Base.tail(cols2), r2)

# Simple vector type for internal use which represents a virtual DataAPI.refpool
# so that a vector of reals can be its own DataAPI.refarray:
# this just allows telling the generic row_group_slots method for generic refpools
# what is the minimum index and the number of (potential) groups
# missing values are supported and represented using index max+1
# Virtual reference pool over a contiguous range of integer codes.
# The element type `T` only records whether the pooled column allows `missing`;
# no values are stored, only the bounds of the range.
struct IntegerRefpool{T<:Union{Real, Missing}} <: AbstractVector{T}
    # Smallest code covered by the pool (used as its first index)
    min::Int
    # Largest non-missing code covered by the pool; when `T >: Missing`,
    # index `max + 1` is reserved for missing values
    max::Int
end

# Number of slots in the pool: one per integer in `min:max`, plus one extra
# slot for `missing` when the element type allows it.
function Base.size(x::IntegerRefpool{T}) where {T}
    nslots = x.max - x.min + 1
    return (nslots + (T >: Missing),)
end

# Valid indices start at `min` (not 1) and extend one past `max` when the
# element type allows `missing`.
function Base.axes(x::IntegerRefpool{T}) where {T}
    stop = x.max + (T >: Missing)
    return (x.min:stop,)
end
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
# The pool is addressed with a single linear index.
function Base.IndexStyle(::Type{<:IntegerRefpool})
    return Base.IndexLinear()
end
# Look up slot `i` of the pool.
# Returns `missing` for the extra slot `max + 1` (only when `T` allows missing),
# otherwise the 1-based offset of `i` from `min`.
# NOTE(review): the non-missing result is `i - x.min + 1`, not `i` itself —
# confirm this is what callers of the pool expect.
@inline function Base.getindex(x::IntegerRefpool{T}, i::Integer) where T
    @boundscheck checkbounds(x, i)
    hits_missing_slot = T >: Missing && i == x.max+1
    return hits_missing_slot ? missing : i - x.min + 1
end
# The pool is declared duplicate-free without inspecting its contents.
function Base.allunique(::IntegerRefpool)
    return true
end

# The pool is declared sorted without inspecting its contents.
function Base.issorted(::IntegerRefpool)
    return true
end

# Return a `(refpool, refarray)` pair for column `x`, or `(nothing, nothing)`
# when no pool-based representation is available.
#
# - If `DataAPI.refpool(x)` is defined, it is returned with `DataAPI.refarray(x)`.
# - Otherwise, if `x` only holds integer-valued reals (and possibly `missing`)
#   spanning a small enough range, an `IntegerRefpool` is built so that the
#   values themselves act as reference codes. Missing values are replaced by
#   the code `maxval + 1`.
# - In every other case `(nothing, nothing)` is returned so that callers use
#   their generic code path.
function refpool_and_array(x::AbstractArray)
    refpool = DataAPI.refpool(x)
    refarray = DataAPI.refarray(x)

    if refpool !== nothing
        return refpool, refarray
    elseif x isa AbstractArray{<:Union{Real, Missing}} &&
        all(v -> ismissing(v) | isinteger(v), x)
        # `isinteger(missing)` is `missing`, and `true | missing` is `true`,
        # so the predicate above always yields a `Bool`
        if isempty(skipmissing(x))
            return nothing, nothing
        end
        minval, maxval = extrema(skipmissing(x))
        # The bounds are stored as `Int`, and `max + 1` is used as an index:
        # give up when they do not fit (this also avoids an `InexactError`
        # for integer-valued floats outside of the `Int` range)
        if !(typemin(Int) <= minval && maxval < typemax(Int))
            return nothing, nothing
        end
        lo = convert(Int, minval)
        hi = convert(Int, maxval)
        # Threshold chosen with the same rationale as the row_group_slots refpool method:
        # refpool approach is faster but we should not allocate too much memory either.
        # The span is computed in a wider type so that integer wraparound cannot
        # make a huge range look small.
        if widen(hi) - widen(lo) + 1 <= 2 * length(x)
            refpool′ = IntegerRefpool{eltype(x)}(lo, hi)
            if eltype(x) >: Missing
                # Missing values go to last group with code maxval+1
                missingcode = maxval + 1
                # With narrow integer types `maxval + 1` may wrap around, and with
                # large floats it may round back to `maxval`: in both cases the
                # code would collide with a real value, so give up
                if !(missingcode > maxval)
                    return nothing, nothing
                end
                refarray′ = Missings.replace(x, missingcode)
            else
                refarray′ = x
            end
            return refpool′, refarray′
        else
            return nothing, nothing
        end
    else
        return nothing, nothing
    end
end

# Helper function for RowGroupDict.
# Returns a tuple:
# 1) the highest group index in the `groups` vector
Expand All @@ -103,16 +156,21 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int,
# 4) whether groups are already sorted
# Optional `groups` vector is set to the group indices of each row (starting at 1)
# With skipmissing=true, rows with missing values are attributed index 0.
# NOTE(review): this short-form definition forwards to a six-argument method that
# receives the result of `DataAPI.refpool.(cols)` as its second argument.
# The `function row_group_slots(...)` definition that follows has an identical
# signature, so if both are loaded the later one replaces this method —
# this looks like the pre-change side of the diff; confirm before keeping it.
row_group_slots(cols::Tuple{Vararg{AbstractVector}},
hash::Val = Val(true),
groups::Union{Vector{Int}, Nothing} = nothing,
skipmissing::Bool = false,
sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} =
row_group_slots(cols, DataAPI.refpool.(cols), hash, groups, skipmissing, sort)
# Entry point: compute a (refpool, refarray) pair for every column via
# `refpool_and_array`, then dispatch to the method taking both tuples explicitly.
function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
                         hash::Val = Val(true),
                         groups::Union{Vector{Int}, Nothing} = nothing,
                         skipmissing::Bool = false,
                         sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
    pooled = map(refpool_and_array, cols)
    return row_group_slots(cols, map(first, pooled), map(last, pooled),
                           hash, groups, skipmissing, sort)
end

# Generic fallback method based on open addressing hash table
function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
refpools::Any,
refpools::Any, # Ignored
refarrays::Any, # Ignored
hash::Val = Val(true),
groups::Union{Vector{Int}, Nothing} = nothing,
skipmissing::Bool = false,
Expand Down Expand Up @@ -163,8 +221,11 @@ function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
end

# Optimized method for arrays for which DataAPI.refpool is defined and returns an AbstractVector
function row_group_slots(cols::NTuple{N, <:AbstractVector},
refpools::NTuple{N, <:AbstractVector},
function row_group_slots(cols::NTuple{N, AbstractVector},
refpools::NTuple{N, AbstractVector},
refs::NTuple{N, Union{AbstractVector{<:Real},
Missings.EachReplaceMissing{
<:AbstractVector{<:Union{Real, Missing}}}}},
hash::Val{false},
groups::Union{Vector{Int}, Nothing} = nothing,
skipmissing::Bool = false,
Expand All @@ -173,7 +234,6 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector},
# and this method needs to allocate a groups vector anyway
@assert groups !== nothing && all(col -> length(col) == length(groups), cols)

refs = map(DataAPI.refarray, cols)
missinginds = map(refpools) do refpool
eltype(refpool) >: Missing ?
something(findfirst(ismissing, refpool), lastindex(refpool)+1) : lastindex(refpool)+1
Expand Down Expand Up @@ -208,9 +268,9 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector},
sort ||
anydups ? cols : refs
return invoke(row_group_slots,
Tuple{Tuple{Vararg{AbstractVector}}, Any, Val,
Tuple{Tuple{Vararg{AbstractVector}}, Any, Any, Val,
Union{Vector{Int}, Nothing}, Bool, Bool},
newcols, refpools, hash, groups, skipmissing, sort)
newcols, refpools, refs, hash, groups, skipmissing, sort)
end

seen = fill(false, ngroups)
Expand Down
Loading