JuliaData · nalimilan · Jan 31, 2021 · Jan 23, 2021 · Jan 23, 2021 · Jan 24, 2021
diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl
@@ -94,6 +94,68 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int,
     isequal(cols1[1][r1], cols2[1][r2]) &&
         isequal_row(Base.tail(cols1), r1, Base.tail(cols2), r2)
 
+# Simple vector type for internal use which represents a virtual DataAPI.refpool
+# so that a vector of reals can be its own DataAPI.refarray:
+# this just allows telling the generic row_group_slots method for generic refpools
+# what is the minimum index and the number of (potential) groups
+# missing values are represented using index max+1
+struct IntegerRefpool{T<:Union{Int, Missing}} <: AbstractVector{T}
+    min::Int
+    max::Int
+    function IntegerRefpool{T}(min::Real, max::Real) where T<:Union{Int, Missing}
+        @assert max < typemax(Int) - 1
+        @assert typemin(Int) <= widen(max) - widen(min) + 1 < typemax(Int)
-        @assert max < typemax(Int) - 1
-        @assert typemin(Int) <= widen(max) - widen(min) + 1 < typemax(Int)
+        @assert min <= max < typemax(Int) - 1
+        @assert widen(max) - widen(min) + 2 < typemax(Int)
-        @assert max < typemax(Int) - 1
-        @assert typemin(Int) <= widen(max) - widen(min) + 1 < typemax(Int)
+        @assert min <= max < typemax(Int) - 1
+        @assert widen(max) - widen(min) + 2 < typemax(Int)
+        new{T}(min, max)
+    end
+end
+
+Base.size(x::IntegerRefpool{T}) where {T} = (x.max - x.min + 1 + (T >: Missing),)
+Base.axes(x::IntegerRefpool{T}) where {T} = (x.min:(x.max + (T >: Missing)),)
+Base.IndexStyle(::Type{<:IntegerRefpool}) = Base.IndexLinear()
+@inline function Base.getindex(x::IntegerRefpool{T}, i::Real) where T
+    @boundscheck checkbounds(x, i)
+    if T >: Missing && i == x.max + 1
+        return missing
+    else
+        return Int(i - x.min + 1)
+    end
+end
+Base.allunique(::IntegerRefpool) = true
+Base.issorted(::IntegerRefpool) = true
+
+function refpool_and_array(x::AbstractArray)
+    refpool = DataAPI.refpool(x)
+    refarray = DataAPI.refarray(x)
+
+    if refpool !== nothing
+        return refpool, refarray
+    elseif x isa AbstractArray{<:Union{Real, Missing}} &&
+        all(v -> ismissing(v) | isinteger(v), x)
+        isempty(skipmissing(x)) && return nothing, nothing
+        minval, maxval = extrema(skipmissing(x))
+        # Threshold chosen with the same rationale as the row_group_slots refpool method:
+        # refpool approach is faster but we should not allocate too much memory either
+        # We also have to avoid overflow
+        if typemin(Int) <= maxval + 1 < typemax(Int) &&
+            typemin(Int) <= minval <= typemax(Int) &&
+            widen(maxval) - widen(minval) + 1 <= 2 * length(x) < typemax(Int)
-            widen(maxval) - widen(minval) + 1 <= 2 * length(x) < typemax(Int)
+            widen(maxval) - widen(minval) + 2 <= 2 * length(x) < typemax(Int)
-            widen(maxval) - widen(minval) + 1 <= 2 * length(x) < typemax(Int)
+            widen(maxval) - widen(minval) + 2 <= 2 * length(x) < typemax(Int)
+            if eltype(x) >: Missing
+                refpool′ = IntegerRefpool{Union{Int, Missing}}(minval, maxval)
+                # Missing values go to last group with code maxval+1
+                refarray′ = Missings.replace(x, maxval+1)
+            else
+                refpool′ = IntegerRefpool{Int}(minval, maxval)
+                refarray′ = x
+            end
+            return refpool′, refarray′
+        else
+            return nothing, nothing
+        end
+    else
+        return nothing, nothing
+    end
+end
+
 # Helper function for RowGroupDict.
 # Returns a tuple:
 # 1) the highest group index in the `groups` vector
@@ -103,16 +165,21 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int,
 # 4) whether groups are already sorted
 # Optional `groups` vector is set to the group indices of each row (starting at 1)
 # With skipmissing=true, rows with missing values are attributed index 0.
-row_group_slots(cols::Tuple{Vararg{AbstractVector}},
-                hash::Val = Val(true),
-                groups::Union{Vector{Int}, Nothing} = nothing,
-                skipmissing::Bool = false,
-                sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} =
-    row_group_slots(cols, DataAPI.refpool.(cols), hash, groups, skipmissing, sort)
+function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
+                         hash::Val = Val(true),
+                         groups::Union{Vector{Int}, Nothing} = nothing,
+                         skipmissing::Bool = false,
+                         sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
+    rpa = refpool_and_array.(cols)
+    refpools = first.(rpa)
+    refarrays = last.(rpa)
+    row_group_slots(cols, refpools, refarrays, hash, groups, skipmissing, sort)
+end
 
 # Generic fallback method based on open adressing hash table
 function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
-                         refpools::Any,
+                         refpools::Any,  # Ignored
+                         refarrays::Any, # Ignored
                          hash::Val = Val(true),
                          groups::Union{Vector{Int}, Nothing} = nothing,
                          skipmissing::Bool = false,
@@ -163,8 +230,12 @@ function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
 end
 
 # Optimized method for arrays for which DataAPI.refpool is defined and returns an AbstractVector
-function row_group_slots(cols::NTuple{N, <:AbstractVector},
-                         refpools::NTuple{N, <:AbstractVector},
+function row_group_slots(cols::NTuple{N, AbstractVector},
+                         refpools::NTuple{N, AbstractVector},
+                         refarrays::NTuple{N,
+                             Union{AbstractVector{<:Real},
+                                   Missings.EachReplaceMissing{
+                                       <:AbstractVector{<:Union{Real, Missing}}}}},
                          hash::Val{false},
                          groups::Union{Vector{Int}, Nothing} = nothing,
                          skipmissing::Bool = false,
@@ -173,7 +244,6 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector},
     # and this method needs to allocate a groups vector anyway
     @assert groups !== nothing && all(col -> length(col) == length(groups), cols)
 
-    refs = map(DataAPI.refarray, cols)
     missinginds = map(refpools) do refpool
         eltype(refpool) >: Missing ?
             something(findfirst(ismissing, refpool), lastindex(refpool)+1) : lastindex(refpool)+1
@@ -205,12 +275,13 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector},
        anydups
         # In the simplest case, we can work directly with the reference codes
         newcols = (skipmissing && any(refpool -> eltype(refpool) >: Missing, refpools)) ||
+                  !(refarrays isa NTuple{<:Any, AbstractVector}) ||
                   sort ||
-                  anydups ? cols : refs
+                  anydups ? cols : refarrays
         return invoke(row_group_slots,
-                      Tuple{Tuple{Vararg{AbstractVector}}, Any, Val,
+                      Tuple{Tuple{Vararg{AbstractVector}}, Any, Any, Val,
                             Union{Vector{Int}, Nothing}, Bool, Bool},
-                      newcols, refpools, hash, groups, skipmissing, sort)
+                      newcols, refpools, refarrays, hash, groups, skipmissing, sort)
     end
 
     seen = fill(false, ngroups)
@@ -253,7 +324,7 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector},
         @inbounds for i in eachindex(groups)
             local refs_i
             let i=i # Workaround for julia#15276
-                refs_i = map(c -> c[i], refs)
+                refs_i = map(c -> c[i], refarrays)
             end
             vals = map((m, r, s, fi) -> m[r-fi+1] * s, refmaps, refs_i, strides, firstinds)
             j = sum(vals) + 1
@@ -269,7 +340,7 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector},
         @inbounds for i in eachindex(groups)
             local refs_i
             let i=i # Workaround for julia#15276
-                refs_i = map(refs, missinginds) do ref, missingind
+                refs_i = map(refarrays, missinginds) do ref, missingind
                     r = Int(ref[i])
                     if skipmissing
                         return r == missingind ? -1 : (r > missingind ? r-1 : r)
@@ -322,7 +393,7 @@ function compute_indices(groups::AbstractVector{<:Integer}, ngroups::Integer)
 
     # group start positions in a sorted table
     starts = Vector{Int}(undef, ngroups+1)
-    if length(starts) > 1
+    if length(starts) > 0
         starts[1] = 1
         @inbounds for i in 1:ngroups
             starts[i+1] = starts[i] + stops[i]

diff --git a/test/data.jl b/test/data.jl
@@ -76,15 +76,14 @@ const ≅ = isequal
     d3 = randn(N)
     d4 = randn(N)
     df7 = DataFrame([d1, d2, d3], [:d1, :d2, :d3])
-    ref_d1 = unique(d1)
 
     #test_group("groupby")
     gd = groupby(df7, :d1)
     @test length(gd) == 2
-    @test gd[1][:, :d2] ≅ d2[d1 .== ref_d1[1]]
-    @test gd[2][:, :d2] ≅ d2[d1 .== ref_d1[2]]
-    @test gd[1][:, :d3] == d3[d1 .== ref_d1[1]]
-    @test gd[2][:, :d3] == d3[d1 .== ref_d1[2]]
+    @test gd[1][:, :d2] ≅ d2[d1 .== 1]
+    @test gd[2][:, :d2] ≅ d2[d1 .== 2]
+    @test gd[1][:, :d3] == d3[d1 .== 1]
+    @test gd[2][:, :d3] == d3[d1 .== 2]
 
     g1 = groupby(df7, [:d1, :d2])
     g2 = groupby(df7, [:d2, :d1])