From f05fc73f568db191e51f32e0a562ca7e81af5336 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 31 Jan 2021 21:06:46 +0100 Subject: [PATCH] Use refpool optimized method for integer grouping (#2610) --- src/dataframerow/utils.jl | 124 ++++++++++++++++--- test/data.jl | 9 +- test/grouping.jl | 251 +++++++++++++++++++++++++++----------- 3 files changed, 293 insertions(+), 91 deletions(-) diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl index ea62c8bf66..4d16e36e72 100644 --- a/src/dataframerow/utils.jl +++ b/src/dataframerow/utils.jl @@ -45,7 +45,7 @@ function hashrows_col!(h::Vector{UInt}, # which is always zero # also when the number of values in the pool is more than half the length # of the vector avoid using this path. 50% is roughly based on benchmarks - if firstcol && 2 * length(rp) < length(v) + if firstcol && Int64(2) * length(rp) < length(v) hashes = Vector{UInt}(undef, length(rp)) @inbounds for (i, v) in zip(eachindex(hashes), rp) hashes[i] = hash(v) @@ -94,6 +94,85 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int, isequal(cols1[1][r1], cols2[1][r2]) && isequal_row(Base.tail(cols1), r1, Base.tail(cols2), r2) +# IntegerRefarray and IntegerRefPool are two complementary view types that allow +# wrapping arrays with Union{Real, Missing} eltype to satisfy the DataAPI.refpool +# and DataAPI.refarray API when calling row_group_slots. +# IntegerRefarray converts values to Int and replaces missing with an integer +# (set by the caller to the maximum value + 1) +# IntegerRefPool subtracts the minimum value - 1 and replaces back the maximum +# value + 1 to missing. This ensures all values are in 1:length(refpool), while +# row_group_slots knows the number of (potential) groups via length(refpool) +# and is able to skip missing values when skipmissing=true + +struct IntegerRefarray{T<:AbstractArray} <: AbstractVector{Int} + x::T + offset::Int + replacement::Int +end + +Base.size(x::IntegerRefarray) = size(x.x) +Base.axes(x::IntegerRefarray) = axes(x.x) +Base.IndexStyle(::Type{<:IntegerRefarray{T}}) where {T} = Base.IndexStyle(T) +@inline function Base.getindex(x::IntegerRefarray, i) + @boundscheck checkbounds(x.x, i) + @inbounds v = x.x[i] + if eltype(x.x) >: Missing && v === missing + return x.replacement + else + # Overflow is guaranteed not to happen by checks before calling the constructor + return Int(v - x.offset) + end +end + +struct IntegerRefpool{T<:Union{Int, Missing}} <: AbstractVector{T} + max::Int + function IntegerRefpool{T}(max::Integer) where T<:Union{Int, Missing} + @assert max < typemax(Int) # to store missing values as max + 1 + new{T}(max) + end +end + +Base.size(x::IntegerRefpool{T}) where {T} = (x.max + (T >: Missing),) +Base.axes(x::IntegerRefpool{T}) where {T} = (Base.OneTo(x.max + (T >: Missing)),) +Base.IndexStyle(::Type{<:IntegerRefpool}) = Base.IndexLinear() +@inline function Base.getindex(x::IntegerRefpool{T}, i::Real) where T + @boundscheck checkbounds(x, i) + if T >: Missing && i == x.max + 1 + return missing + else + return Int(i) + end +end +Base.allunique(::IntegerRefpool) = true +Base.issorted(::IntegerRefpool) = true + +function refpool_and_array(x::AbstractArray) + refpool = DataAPI.refpool(x) + refarray = DataAPI.refarray(x) + + if refpool !== nothing + return refpool, refarray + elseif x isa AbstractArray{<:Union{Real, Missing}} && + all(v -> ismissing(v) | isinteger(v), x) && + !isempty(skipmissing(x)) + minval, maxval = extrema(skipmissing(x)) + ngroups = big(maxval) - big(minval) + 1 + # Threshold chosen with the same rationale as the row_group_slots refpool method: + # refpool approach is faster but we should not allocate too much memory either + # We also have to avoid overflow, including with ngroups + 1 for missing values + # (note that it would be possible to allow minval and maxval to be outside of the + # range supported by Int by adding a type parameter for minval to IntegerRefarray) + if typemin(Int) < minval <= maxval < typemax(Int) && + ngroups + 1 <= Int64(2) * length(x) <= typemax(Int) + T = eltype(x) >: Missing ? Union{Int, Missing} : Int + refpool′ = IntegerRefpool{T}(Int(ngroups)) + refarray′ = IntegerRefarray(x, Int(minval) - 1, Int(ngroups) + 1) + return refpool′, refarray′ + end + end + return nothing, nothing +end + # Helper function for RowGroupDict. # Returns a tuple: # 1) the highest group index in the `groups` vector @@ -103,16 +182,21 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int, # 4) whether groups are already sorted # Optional `groups` vector is set to the group indices of each row (starting at 1) # With skipmissing=true, rows with missing values are attributed index 0. -row_group_slots(cols::Tuple{Vararg{AbstractVector}}, - hash::Val = Val(true), - groups::Union{Vector{Int}, Nothing} = nothing, - skipmissing::Bool = false, - sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} = - row_group_slots(cols, DataAPI.refpool.(cols), hash, groups, skipmissing, sort) +function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, + hash::Val = Val(true), + groups::Union{Vector{Int}, Nothing} = nothing, + skipmissing::Bool = false, + sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} + rpa = refpool_and_array.(cols) + refpools = first.(rpa) + refarrays = last.(rpa) + row_group_slots(cols, refpools, refarrays, hash, groups, skipmissing, sort) +end # Generic fallback method based on open adressing hash table function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, - refpools::Any, + refpools::Any, # Ignored + refarrays::Any, # Ignored hash::Val = Val(true), groups::Union{Vector{Int}, Nothing} = nothing, skipmissing::Bool = false, @@ -163,8 +247,12 @@ function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, end # Optimized method for arrays for which DataAPI.refpool is defined and returns an AbstractVector -function row_group_slots(cols::NTuple{N, <:AbstractVector}, - refpools::NTuple{N, <:AbstractVector}, +function row_group_slots(cols::NTuple{N, AbstractVector}, + refpools::NTuple{N, AbstractVector}, + refarrays::NTuple{N, + Union{AbstractVector{<:Real}, + Missings.EachReplaceMissing{ + <:AbstractVector{<:Union{Real, Missing}}}}}, hash::Val{false}, groups::Union{Vector{Int}, Nothing} = nothing, skipmissing::Bool = false, @@ -173,7 +261,6 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector}, # and this method needs to allocate a groups vector anyway @assert groups !== nothing && all(col -> length(col) == length(groups), cols) - refs = map(DataAPI.refarray, cols) missinginds = map(refpools) do refpool eltype(refpool) >: Missing ? something(findfirst(ismissing, refpool), lastindex(refpool)+1) : lastindex(refpool)+1 @@ -201,16 +288,17 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector}, # but it needs to remain reasonable compared with the size of the data frame. anydups = !all(allunique, refpools) if prod(big.(ngroupstup)) > typemax(Int) || - ngroups > 2 * length(groups) || + ngroups > Int64(2) * length(groups) || anydups # In the simplest case, we can work directly with the reference codes newcols = (skipmissing && any(refpool -> eltype(refpool) >: Missing, refpools)) || + !(refarrays isa NTuple{<:Any, AbstractVector}) || sort || - anydups ? cols : refs + anydups ? cols : refarrays return invoke(row_group_slots, - Tuple{Tuple{Vararg{AbstractVector}}, Any, Val, + Tuple{Tuple{Vararg{AbstractVector}}, Any, Any, Val, Union{Vector{Int}, Nothing}, Bool, Bool}, - newcols, refpools, hash, groups, skipmissing, sort) + newcols, refpools, refarrays, hash, groups, skipmissing, sort) end seen = fill(false, ngroups) @@ -253,7 +341,7 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector}, @inbounds for i in eachindex(groups) local refs_i let i=i # Workaround for julia#15276 - refs_i = map(c -> c[i], refs) + refs_i = map(c -> c[i], refarrays) end vals = map((m, r, s, fi) -> m[r-fi+1] * s, refmaps, refs_i, strides, firstinds) j = sum(vals) + 1 @@ -269,7 +357,7 @@ function row_group_slots(cols::NTuple{N, <:AbstractVector}, @inbounds for i in eachindex(groups) local refs_i let i=i # Workaround for julia#15276 - refs_i = map(refs, missinginds) do ref, missingind + refs_i = map(refarrays, missinginds) do ref, missingind r = Int(ref[i]) if skipmissing return r == missingind ? -1 : (r > missingind ? r-1 : r) @@ -322,7 +410,7 @@ function compute_indices(groups::AbstractVector{<:Integer}, ngroups::Integer) # group start positions in a sorted table starts = Vector{Int}(undef, ngroups+1) - if length(starts) > 1 + if length(starts) > 0 starts[1] = 1 @inbounds for i in 1:ngroups starts[i+1] = starts[i] + stops[i] diff --git a/test/data.jl b/test/data.jl index b78d2c8ef8..8cf8023b2f 100644 --- a/test/data.jl +++ b/test/data.jl @@ -76,15 +76,14 @@ const ≅ = isequal d3 = randn(N) d4 = randn(N) df7 = DataFrame([d1, d2, d3], [:d1, :d2, :d3]) - ref_d1 = unique(d1) #test_group("groupby") gd = groupby(df7, :d1) @test length(gd) == 2 - @test gd[1][:, :d2] ≅ d2[d1 .== ref_d1[1]] - @test gd[2][:, :d2] ≅ d2[d1 .== ref_d1[2]] - @test gd[1][:, :d3] == d3[d1 .== ref_d1[1]] - @test gd[2][:, :d3] == d3[d1 .== ref_d1[2]] + @test gd[1][:, :d2] ≅ d2[d1 .== 1] + @test gd[2][:, :d2] ≅ d2[d1 .== 2] + @test gd[1][:, :d3] == d3[d1 .== 1] + @test gd[2][:, :d3] == d3[d1 .== 2] g1 = groupby(df7, [:d1, :d2]) g2 = groupby(df7, [:d2, :d1]) diff --git a/test/grouping.jl b/test/grouping.jl index 276c9c2f9f..3551766845 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -166,12 +166,12 @@ end res4 = df[:, cols] res4.x2 = df.x.^2 shcatdf = sort(hcatdf, colssym) - sres = sort(res, colssym) - sres2 = sort(res2, colssym) - sres3 = sort(res3, colssym) - sres4 = sort(res4, colssym) # groupby_checked() without groups sorting + sres = sort(res) + sres2 = sort(res2) + sres3 = sort(res3) + sres4 = sort(res4) gd = groupby_checked(df, cols) @test names(parent(gd), gd.cols) == string.(colssym) df_comb = combine(identity, gd) @@ -181,16 +181,20 @@ end df_ref = DataFrame(gd) @test sort(hcat(df_ref[!, cols], df_ref[!, Not(cols)]), colssym) == shcatdf @test df_ref.x == df_comb.x - @test combine(f1, gd) == res - @test combine(f2, gd) == res - @test rename(combine(f3, gd), :x1 => :xmax) == res - @test combine(f4, gd) == res2 - @test combine(f5, gd) == res2 - @test combine(f6, gd) == res3 - @test sort(combine(f7, gd), colssym) == sort(res4, colssym) - @test sort(combine(f8, gd), colssym) == sort(res4, colssym) + @test sort(combine(f1, gd)) == sres + @test sort(combine(f2, gd)) == sres + @test sort(rename(combine(f3, gd), :x1 => :xmax)) == sres + @test sort(combine(f4, gd)) == sres2 + @test sort(combine(f5, gd)) == sres2 + @test sort(combine(f6, gd)) == sres3 + @test sort(combine(f7, gd)) == sres4 + @test sort(combine(f8, gd)) == sres4 # groupby_checked() with groups sorting + sres = sort(res, colssym) + sres2 = sort(res2, colssym) + sres3 = sort(res3, colssym) + sres4 = sort(res4, colssym) gd = groupby_checked(df, cols, sort=true) @test names(parent(gd), gd.cols) == string.(colssym) for i in 1:length(gd) @@ -599,6 +603,17 @@ end @test issorted(vcat(gd...), [:Key1, :Key2]) end end + + @test groupby_checked(DataFrame(x=[missing]), :x).groups == + groupby_checked(DataFrame(x=Union{Int, Missing}[missing]), :x).groups == + groupby_checked(DataFrame(x=Union{String, Missing}[missing]), :x).groups == + groupby_checked(DataFrame(x=Any[missing]), :x).groups == [1] + @test isempty(groupby_checked(DataFrame(x=[missing]), :x, skipmissing=true)) + @test isempty(groupby_checked(DataFrame(x=Union{Int, Missing}[missing]), + :x, skipmissing=true)) + @test isempty(groupby_checked(DataFrame(x=Union{String, Missing}[missing]), + :x, skipmissing=true)) + @test isempty(groupby_checked(DataFrame(x=Any[missing]), :x, skipmissing=true)) end @testset "grouping arrays that allow missing without missings" begin @@ -717,6 +732,119 @@ end end end +@testset "grouping on integer columns" begin + Random.seed!(6) + + # Check optimized approach based on refpool method + for sm in (false, true), + S in (Int, Float64), + T in (Int, Float64), + df in (DataFrame(x=rand(1:10, 1000), + y=rand(-3:10, 1000), z=rand(1000)), + DataFrame(x=rand([1:10; missing], 1000), + y=rand([1:10; missing], 1000), z=rand(1000)), + DataFrame(x=rand([1:10; missing], 1000), + y=rand(-3:10, 1000), z=rand(1000))) + df.x = convert.(Union{S, Missing}, df.x) + df.y = convert.(Union{T, Missing}, df.y) + df.x2 = passmissing(string).(df.x) + df.y2 = passmissing(string).(df.y) + gd = groupby_checked(df, :x, skipmissing=sm) + @test issorted(combine(gd, :x)) # Test that optimized method is used + @test isequal_unordered(gd, [groupby_checked(df, :x2, skipmissing=sm)...]) + gd = groupby_checked(df, [:x, :y], skipmissing=sm) + @test issorted(combine(gd, :x, :y)) # Test that optimized method is used + @test isequal_unordered(gd, [groupby_checked(df, [:x2, :y2], skipmissing=sm)...]) + end + for sm in (false, true), + v in (typemin(Int), typemax(Int) - 11), + df in (DataFrame(x=rand((1:10) .+ v, 1000), + y=rand(-3:10, 1000), z=rand(1000)), + DataFrame(x=rand([1:10; missing] .+ v, 1000), + y=rand([1:10; missing], 1000), z=rand(1000)), + DataFrame(x=rand([1:10; missing] .+ v, 1000), + y=rand(-3:10, 1000), z=rand(1000))) + df.x = allowmissing(df.x) + df.y = allowmissing(df.y) + df.x2 = passmissing(string).(df.x) + df.y2 = passmissing(string).(df.y) + gd = groupby_checked(df, :x, skipmissing=sm) + @test issorted(combine(gd, :x)) # Test that optimized method is used + @test isequal_unordered(gd, [groupby_checked(df, :x2, skipmissing=sm)...]) + gd = groupby_checked(df, [:x, :y], skipmissing=sm) + @test issorted(combine(gd, :x, :y)) # Test that optimized method is used + @test isequal_unordered(gd, [groupby_checked(df, [:x2, :y2], skipmissing=sm)...]) + end + + # Check fallback to hash table method when range is too wide + for sm in (false, true), + S in (Int, Float64), + T in (Int, Float64), + df in (DataFrame(x=rand(1:100_000, 100), + y=rand(-50:110_000, 100), z=rand(100)), + DataFrame(x=rand([1:100_000; missing], 100), + y=rand([-50:110_000; missing], 100), z=rand(100)), + DataFrame(x=rand([1:100_000; missing], 100), + y=rand(-50:110_000, 100), z=rand(100))) + df.x = convert.(Union{S, Missing}, df.x) + df.y = convert.(Union{T, Missing}, df.y) + df.x2 = passmissing(string).(df.x) + df.y2 = passmissing(string).(df.y) + gd = groupby_checked(df, :x, skipmissing=sm) + @test !issorted(combine(gd, :x)) # Test that optimized method is not used + @test isequal_unordered(gd, [groupby_checked(df, :x2, skipmissing=sm)...]) + gd = groupby_checked(df, [:x, :y], skipmissing=sm) + @test !issorted(combine(gd, :x, :y)) # Test that optimized method is not used + @test isequal_unordered(gd, [groupby_checked(df, [:x2, :y2], skipmissing=sm)...]) + end + + @test isempty(groupby_checked(DataFrame(x=Int[]), :x)) + @test isempty(groupby_checked(DataFrame(x=Union{}[]), :x)) + @test isempty(groupby_checked(DataFrame(x=Union{Int, Missing}[]), :x)) + @test groupby_checked(DataFrame(x=Union{Int, Missing}[missing]), :x) ≅ + groupby_checked(DataFrame(x=Union{String, Missing}[missing]), :x) ≅ + groupby_checked(DataFrame(x=[missing]), :x) + @test isempty(groupby_checked(DataFrame(x=Union{Int, Missing}[missing]), + skipmissing=true, :x)) + @test isempty(groupby_checked(DataFrame(x=[missing]), skipmissing=true, :x)) + + # Check Int overflow + groups = rand(1:3, 100) + for i in (0, 1, 2, 10), j in (0, 1, 2, 10), + v in (big(0), missing) + @test groupby_checked(DataFrame(x=[big(typemax(Int)) + i, v, + big(typemin(Int)) - j][groups]), :x) ≅ + groupby_checked(DataFrame(x=Any[big(typemax(Int)) + i, v, + big(typemin(Int)) - j][groups]), :x) + end + # Corner cases where overflow could happen due to additional missing values group + for i in (0, 1, 2), j in (0, 1, 2), + v in (0, missing) + @test groupby_checked(DataFrame(x=[typemax(Int) - i, v, + typemin(Int) + j][groups]), :x) ≅ + groupby_checked(DataFrame(x=Any[typemax(Int) - i, v, + typemin(Int) + j][groups]), :x) + @test groupby_checked(DataFrame(x=[typemax(Int) ÷ 2 - i, v, + typemin(Int) ÷ 2 - j][groups]), :x) ≅ + groupby_checked(DataFrame(x=Any[typemax(Int) ÷ 2 - i, v, + typemin(Int) ÷ 2 - j][groups]), :x) + end + for i in (0, 1, -1, 2, -2, 10, -10) + @test groupby_checked(DataFrame(x=fill(big(typemax(Int)) + i, 100)), :x).groups == + fill(1, 100) + end + + # Check special case of Bool + for sm in (false, true), + df in (DataFrame(x=rand(Bool, 1000), y=rand(1000)), + DataFrame(x=rand([true, false, missing], 1000), y=rand(1000))) + df.x2 = passmissing(string).(df.x) + gd = groupby_checked(df, :x, skipmissing=sm) + @test issorted(combine(gd, :x)) # Test that optimized method is used + @test isequal_unordered(gd, [groupby_checked(df, :x2, skipmissing=sm)...]) + end +end + @testset "grouping with three keys" begin # We need many rows so that optimized CategoricalArray method is used xv = rand(["A", "B", missing], 100) @@ -2135,13 +2263,12 @@ end end @testset "correct dropping of groups" begin - df = DataFrame(g = 10:-1:1) + df = DataFrame(g = 1:10) gdf = groupby_checked(df, :g) sgdf = groupby_checked(df, :g, sort=true) for keep in [[3, 2, 1], [5, 3, 1], [9], Int[]] - @test combine(gdf, :g => first => :keep, :g => x -> x[1] in keep ? x : Int[]) == - DataFrame(g=keep, keep=keep, g_function=keep) - @test combine(sgdf, :g => first => :keep, :g => x -> x[1] in keep ? x : Int[]) == + @test sort(combine(gdf, :g => first => :keep, :g => x -> x[1] in keep ? x : Int[])) == + combine(sgdf, :g => first => :keep, :g => x -> x[1] in keep ? x : Int[]) == sort(DataFrame(g=keep, keep=keep, g_function=keep)) end end @@ -2200,69 +2327,63 @@ end if !(df.g isa CategoricalVector) gdf = groupby_checked(df, :g, sort=false, skipmissing=false) + @test sort(combine(gdf, :x => sum, keepkeys=true, ungroup=true)) ≅ + DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) @test combine(gdf, :x => sum, keepkeys=false, ungroup=true) == - DataFrame(x_sum = [1, 5, 4]) + select(combine(gdf, :x => sum, keepkeys=true, ungroup=true), :x_sum) @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, ungroup=false) - @test combine(gdf, :x => sum, keepkeys=true, ungroup=true) ≅ - DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:3 - @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) - @test DataFrame(gdf2, keepkeys=false) == DataFrame(x_sum = [1, 5, 4]) - - @test combine(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅ - DataFrame(x_sum = [1, 5, 5, 4], g = [3, 1, 1, missing]) - @test combine(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅ - DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4]) + @test sort(DataFrame(gdf2)) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) + @test DataFrame(gdf2, keepkeys=false) == select(DataFrame(gdf2), :x_sum) + + @test sort(combine(gdf, :x => sum, :g, keepkeys=false, ungroup=true)) ≅ + DataFrame(x_sum = [1, 4, 5, 5], g = [3, missing, 1, 1]) + @test sort(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=true)) ≅ + DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4]) gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == [1, 2, 2, 3] - @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4]) - @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5, 5, 4]) + @test sort(DataFrame(gdf2)) ≅ DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4]) + @test DataFrame(gdf2, keepkeys=false) ≅ select(DataFrame(gdf2), :x_sum) + @test sort(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true)) ≅ + DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, ungroup=true) == - DataFrame(x_sum = [1, 5, 4]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true) ≅ - DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) + select(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true), :x_sum) gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:3 - @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) - @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5, 4]) + @test sort(DataFrame(gdf2)) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) + @test DataFrame(gdf2, keepkeys=false) ≅ select(DataFrame(gdf2), :x_sum) gdf = groupby_checked(df, :g, sort=false, skipmissing=true) - @test combine(gdf, :x => sum, keepkeys=false, ungroup=true) == + @test sort(combine(gdf, :x => sum, keepkeys=false, ungroup=true)) ≅ DataFrame(x_sum = [1, 5]) @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, ungroup=false) - @test combine(gdf, :x => sum, keepkeys=true, ungroup=true) ≅ - DataFrame(g = [3, 1], x_sum = [1, 5]) + @test sort(combine(gdf, :x => sum, keepkeys=true, ungroup=true)) ≅ + DataFrame(g = [1, 3], x_sum = [5, 1]) gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:2 - @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5]) - @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5]) + @test sort(DataFrame(gdf2)) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) + @test DataFrame(gdf2, keepkeys=false) ≅ select(DataFrame(gdf2), :x_sum) - @test combine(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅ + @test sort(combine(gdf, :x => sum, :g, keepkeys=false, ungroup=true)) ≅ DataFrame(x_sum = [1, 5, 5], g = [3, 1, 1]) - @test combine(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅ - DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5]) + @test sort(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=true)) ≅ + DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1]) gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == [1, 2, 2] - @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5]) - @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5, 5]) + @test sort(DataFrame(gdf2)) ≅ DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1]) + @test DataFrame(gdf2, keepkeys=false) ≅ select(DataFrame(gdf2), :x_sum) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, ungroup=true) == - DataFrame(x_sum = [1, 5]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true) ≅ - DataFrame(g = [3, 1], x_sum = [1, 5]) + @test sort(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true)) ≅ + DataFrame(g = [1, 3], x_sum = [5, 1]) + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, ungroup=true) ≅ + select(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true), :x_sum) gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:2 - @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5]) - @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5]) + @test sort(DataFrame(gdf2)) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) + @test DataFrame(gdf2, keepkeys=false) ≅ select(DataFrame(gdf2), :x_sum) end gdf = groupby_checked(df, :g, sort=true, skipmissing=false) @@ -2274,7 +2395,6 @@ end DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:3 @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1, 4]) @@ -2284,7 +2404,6 @@ end DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4]) gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == [1, 1, 2, 3] @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4]) @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 5, 1, 4]) @@ -2294,7 +2413,6 @@ end DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:3 @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1, 4]) @@ -2307,7 +2425,6 @@ end DataFrame(g = [1, 3], x_sum = [5, 1]) gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:2 @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1]) @@ -2317,7 +2434,6 @@ end DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1]) gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == [1, 1, 2] @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1]) @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 5, 1]) @@ -2327,7 +2443,6 @@ end DataFrame(g = [1, 3], x_sum = [5, 1]) gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:2 @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1]) end @@ -2425,8 +2540,7 @@ end @test res1.x_mean + res1.x_function ≈ df.x res2 = combine(gdf, :x => mean, :x => x -> x .- mean(x), :id) - @test unique(res2.g) == - (dosort || df.g isa CategoricalVector ? sort! : identity)(unique(df.g)) + @test unique(res2.g) == sort(unique(df.g)) for i in unique(res2.g) @test issorted(filter(:g => x -> x == i, res2).id) end @@ -3153,11 +3267,12 @@ end @test transform(df, :x => x -> 2x) == transform(gdf, :x => x -> 2x) @test transform(df, identity) == transform(gdf, identity) @test transform(df, x -> (a=x.x, b=x.x)) == transform(gdf, x -> (a=x.x, b=x.x)) - @test combine(gdf, :x => x -> 2x) == - DataFrame(id=[1, 1, 3, 3, 2, 2], x_function=[6, 10, 2, 8, 4, 12]) - @test combine(gdf, identity) == DataFrame(id=[1, 1, 3, 3, 2, 2], x=[3, 5, 1, 4, 2, 6]) - @test combine(gdf, x -> (a=x.x, b=x.x)) == - DataFrame(id=[1, 1, 3, 3, 2, 2], a=[3, 5, 1, 4, 2, 6], b=[3, 5, 1, 4, 2, 6]) + @test sort(combine(gdf, :x => x -> 2x)) == + DataFrame(id=[1, 1, 2, 2, 3, 3], x_function=[6, 10, 4, 12, 2, 8]) + @test sort(combine(gdf, identity)) == + DataFrame(id=[1, 1, 2, 2, 3, 3], x=[3, 5, 2, 6, 1, 4]) + @test sort(combine(gdf, x -> (a=x.x, b=x.x))) == + DataFrame(id=[1, 1, 2, 2, 3, 3], a=[3, 5, 2, 6, 1, 4], b=[3, 5, 2, 6, 1, 4]) end @testset "basic tests of advanced rules with multicolumn output" begin