From 289fadfd752d22b6bec3d9450d46a95e25eda724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 11 Jun 2021 19:50:59 +0200 Subject: [PATCH 01/13] add setindex! rules --- src/subdataframe/subdataframe.jl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index 6b2b308aad..7001c27e3b 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -181,8 +181,16 @@ Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, id setindex!(sdf, val, idx[1], idx[2]) end Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::Colon, colinds::Any) - parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)] = val - return sdf + if colinds isa SymbolOrString && getfield(sdf, :colindex) isa Index && + && val isa AbstractVector && columnindex(sdf, colinds) == 0 && nrow(sdf) == length(val) + T = eltype(val) + newcol = Tables.allocatecolumn(Union{T, Missing}, n) + fill!(newcol, missing) + view(newcol, rows(sdf)) = val + else + parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)] = val + end +return sdf end Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::typeof(!), colinds::Any) throw(ArgumentError("setting index of SubDataFrame using ! as row selector is not allowed")) From c6bee1be273819e4ab46a8ed1372c27c574f7489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 18 Jun 2021 14:01:27 +0200 Subject: [PATCH 02/13] fix float grouping bug --- NEWS.md | 2 ++ src/groupeddataframe/utils.jl | 3 +-- test/grouping.jl | 4 ++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 243e5da8c4..491b041f2a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,6 +14,8 @@ ## Bug fixes +* fix bug in how `groupby` handles grouping of float columns + ([#XXXX](https://github.com/JuliaData/DataFrames.jl/pull/XXXX)) * fix bug in how `issorted` handles custom orderings and improve performance of sorting when complex custom orderings are passed ([#2746](https://github.com/JuliaData/DataFrames.jl/pull/2746)) diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index 52d1486bba..8e8585285f 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -143,8 +143,7 @@ function refpool_and_array(x::AbstractArray) else return nothing, nothing end - elseif x isa AbstractArray{<:Union{Real, Missing}} && - all(v -> ismissing(v) | isinteger(v), x) && + elseif x isa AbstractArray{<:Union{Integer, Missing}} && !isempty(skipmissing(x)) minval, maxval = extrema(skipmissing(x)) ngroups = big(maxval) - big(minval) + 1 diff --git a/test/grouping.jl b/test/grouping.jl index 95c3624098..00cc5d3687 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -3710,4 +3710,8 @@ end end end +@testset "grouping floats" begin + @test length(groupby_checked(DataFrame(a=[0.0, -0.0]), :a)) == 2 +end + end # module From 9c9ed1153030d54afed2b17bb9ce0ca5df691a30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 18 Jun 2021 14:03:53 +0200 Subject: [PATCH 03/13] Revert "add setindex! rules" This reverts commit 289fadfd752d22b6bec3d9450d46a95e25eda724. --- src/subdataframe/subdataframe.jl | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index 7001c27e3b..6b2b308aad 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -181,16 +181,8 @@ Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, id setindex!(sdf, val, idx[1], idx[2]) end Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::Colon, colinds::Any) - if colinds isa SymbolOrString && getfield(sdf, :colindex) isa Index && - && val isa AbstractVector && columnindex(sdf, colinds) == 0 && nrow(sdf) == length(val) - T = eltype(val) - newcol = Tables.allocatecolumn(Union{T, Missing}, n) - fill!(newcol, missing) - view(newcol, rows(sdf)) = val - else - parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)] = val - end -return sdf + parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)] = val + return sdf end Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::typeof(!), colinds::Any) throw(ArgumentError("setting index of SubDataFrame using ! as row selector is not allowed")) From d57eff874eef15d044d41558c06aa3148447f3de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 18 Jun 2021 14:04:24 +0200 Subject: [PATCH 04/13] fix version --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 491b041f2a..815906bc02 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,7 +15,7 @@ ## Bug fixes * fix bug in how `groupby` handles grouping of float columns - ([#XXXX](https://github.com/JuliaData/DataFrames.jl/pull/XXXX)) + ([#2791](https://github.com/JuliaData/DataFrames.jl/pull/2791)) * fix bug in how `issorted` handles custom orderings and improve performance of sorting when complex custom orderings are passed ([#2746](https://github.com/JuliaData/DataFrames.jl/pull/2746)) From 3641c990cf36768cbd8efdeda85234b9e2006485 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 18 Jun 2021 16:07:09 +0200 Subject: [PATCH 05/13] fix tests --- test/grouping.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/grouping.jl b/test/grouping.jl index 5b29d5366e..ee6055f406 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -751,8 +751,8 @@ end # Check optimized approach based on refpool method for sm in (false, true), - S in (Int, Float64), - T in (Int, Float64), + S in (Int, BigInt), + T in (Int, BigInt), df in (DataFrame(x=rand(1:10, 1000), y=rand(-3:10, 1000), z=rand(1000)), DataFrame(x=rand([1:10; missing], 1000), @@ -792,8 +792,8 @@ end # Check fallback to hash table method when range is too wide for sm in (false, true), - S in (Int, Float64), - T in (Int, Float64), + S in (Int, Int32), + T in (Int, Int32), df in (DataFrame(x=rand(1:100_000, 100), y=rand(-50:110_000, 100), z=rand(100)), DataFrame(x=rand([1:100_000; missing], 100), From 9b7e9a6cadb64af4f039987e293e9bc96be97550 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 19 Jun 2021 22:33:01 +0200 Subject: [PATCH 06/13] allow fast grouping of floats --- src/groupeddataframe/utils.jl | 25 ++++++++++++++++++++++--- test/grouping.jl | 12 ++++++++---- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index 8e8585285f..f59e79e515 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -143,9 +143,28 @@ function refpool_and_array(x::AbstractArray) else return nothing, nothing end - elseif x isa AbstractArray{<:Union{Integer, Missing}} && - !isempty(skipmissing(x)) - minval, maxval = extrema(skipmissing(x)) + elseif x isa AbstractArray{<:Union{Real, Missing}} && length(x) > 0 + if !(x isa AbstractArray{<:Union{Integer, Missing}}) + if !all(v -> (ismissing(v) | isinteger(v)) & (!isequal(v, -0.0)), x) + return nothing, nothing + end + end + if Missing <: eltype(x) + smx = skipmissing(x) + y = iterate(smx) + y === nothing && return nothing, nothing + (v, s) = y + minval, maxval = v + while true + y = iterate(smx, s) + y === nothing && break + (v, s) = y + maxval = max(v, maxval) + minval = min(v, minval) + end + else + minval, maxval = extrema(x) + end ngroups = big(maxval) - big(minval) + 1 # Threshold chosen with the same rationale as the row_group_slots refpool method: # refpool approach is faster but we should not allocate too much memory either diff --git a/test/grouping.jl b/test/grouping.jl index ee6055f406..2c6a93041a 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -751,8 +751,8 @@ end # Check optimized approach based on refpool method for sm in (false, true), - S in (Int, BigInt), - T in (Int, BigInt), + S in (Int, Float64), + T in (Int, Float64), df in (DataFrame(x=rand(1:10, 1000), y=rand(-3:10, 1000), z=rand(1000)), DataFrame(x=rand([1:10; missing], 1000), @@ -792,8 +792,8 @@ end # Check fallback to hash table method when range is too wide for sm in (false, true), - S in (Int, Int32), - T in (Int, Int32), + S in (Int, Float64), + T in (Int, Float64), df in (DataFrame(x=rand(1:100_000, 100), y=rand(-50:110_000, 100), z=rand(100)), DataFrame(x=rand([1:100_000; missing], 100), @@ -3712,6 +3712,10 @@ end @testset "grouping floats" begin @test length(groupby_checked(DataFrame(a=[0.0, -0.0]), :a)) == 2 + @test getindex.(keys(groupby_checked(DataFrame(a=[3.0, 2.0, 0.0]), :a)), 1) == + [0, 2, 3] + @test getindex.(keys(groupby_checked(DataFrame(a=[3.0, 2.0, -0.0]), :a)), 1) == + [3, 2, 0] end @testset "aggregation with matrix of Pair" begin From df6f3cd5f8b6f5bea510ac0d4ed04beaf4a3864c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 20 Jun 2021 09:55:19 +0200 Subject: [PATCH 07/13] fix typo --- src/groupeddataframe/utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index f59e79e515..e19422a9dd 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -154,7 +154,7 @@ function refpool_and_array(x::AbstractArray) y = iterate(smx) y === nothing && return nothing, nothing (v, s) = y - minval, maxval = v + minval = maxval = v while true y = iterate(smx, s) y === nothing && break From 96febcf7008cb4885bf15ee960555f79b52e45cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 26 Jun 2021 20:47:02 +0200 Subject: [PATCH 08/13] Update src/groupeddataframe/utils.jl Co-authored-by: Milan Bouchet-Valat --- src/groupeddataframe/utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index e19422a9dd..da2335cad2 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -145,7 +145,7 @@ function refpool_and_array(x::AbstractArray) end elseif x isa AbstractArray{<:Union{Real, Missing}} && length(x) > 0 if !(x isa AbstractArray{<:Union{Integer, Missing}}) - if !all(v -> (ismissing(v) | isinteger(v)) & (!isequal(v, -0.0)), x) + if !all(v -> (ismissing(v) | isinteger(v)) & !isequal(v, -0.0), x) return nothing, nothing end end From b3ab07dd2e9c422e78d69331d0a84acdde7c0864 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 26 Jun 2021 20:47:16 +0200 Subject: [PATCH 09/13] update news --- NEWS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 5da977307c..9cecc65eed 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,7 +14,9 @@ ## Bug fixes -* fix bug in how `groupby` handles grouping of float columns +* fix bug in how `groupby` handles grouping of float columns; + now `-0.0` is treated as *not integer* when deciding on which + grouping algorithm should be used ([#2791](https://github.com/JuliaData/DataFrames.jl/pull/2791)) * fix bug in how `issorted` handles custom orderings and improve performance of sorting when complex custom orderings are passed From c5173f1e32590c9a1b3bfa6bb4d3c99cab72f421 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 26 Jun 2021 20:56:31 +0200 Subject: [PATCH 10/13] code cleanup after the review --- src/groupeddataframe/groupeddataframe.jl | 5 +++++ src/groupeddataframe/utils.jl | 14 ++++---------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index df8502209a..a0036ea3e7 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -79,6 +79,11 @@ an `AbstractDict` can be used to index into a grouped data frame where the keys are column names of the data frame. The order of the keys does not matter in this case. +A column is considered to be an integer column when deciding on the grouping +algorithm choice if its `eltype` is a subtype of `Union{Missing, Real}`, all its +elements are either `missing` or pass `isinteger` test, and none of them is +equal to `-0.0`. + # See also [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref), [`transform!`](@ref) diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index da2335cad2..a750d14cf0 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -151,16 +151,10 @@ function refpool_and_array(x::AbstractArray) end if Missing <: eltype(x) smx = skipmissing(x) - y = iterate(smx) - y === nothing && return nothing, nothing - (v, s) = y - minval = maxval = v - while true - y = iterate(smx, s) - y === nothing && break - (v, s) = y - maxval = max(v, maxval) - minval = min(v, minval) + if isempty(smx) + return nothing, nothing + else + minval, maxval = extrema(smx) end else minval, maxval = extrema(x) From bdd4b1c87715afe8882e92f7287a23aa87977a0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 27 Jun 2021 22:27:43 +0200 Subject: [PATCH 11/13] Update src/groupeddataframe/utils.jl Co-authored-by: Milan Bouchet-Valat --- src/groupeddataframe/utils.jl | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index a750d14cf0..40963e904b 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -143,12 +143,9 @@ function refpool_and_array(x::AbstractArray) else return nothing, nothing end - elseif x isa AbstractArray{<:Union{Real, Missing}} && length(x) > 0 - if !(x isa AbstractArray{<:Union{Integer, Missing}}) - if !all(v -> (ismissing(v) | isinteger(v)) & !isequal(v, -0.0), x) - return nothing, nothing - end - end + elseif x isa AbstractArray{<:Union{Real, Missing}} && length(x) > 0 && + (x isa AbstractArray{<:Union{Integer, Missing}} || + all(v -> (ismissing(v) | isinteger(v)) & !isequal(v, -0.0), x)) if Missing <: eltype(x) smx = skipmissing(x) if isempty(smx) From 2b7e9f681b84a43bc4629122d7758c11cf7fe7cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 27 Jun 2021 22:33:48 +0200 Subject: [PATCH 12/13] improve docstring --- src/groupeddataframe/groupeddataframe.jl | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index a0036ea3e7..d703d340f7 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -45,14 +45,8 @@ into row groups. ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). - `sort` : whether to sort groups according to the values of the grouping columns `cols`; if `sort=false` then the order of groups in the result is undefined - and may change in future releases. In the current implementation - groups are ordered following the order of appearance of values in the grouping - columns, except when all grouping columns provide non-`nothing` - `DataAPI.refpool` in which case the order of groups follows the order of - values returned by `DataAPI.refpool`. As a particular application of this rule - if all `cols` are `CategoricalVector`s then groups are always sorted - Integer columns with a narrow range also use this this optimization, so - to the order of groups when grouping on integer columns is undefined. + and may change in future releases; below a description of the current + implementation is provided. - `skipmissing` : whether to skip groups with `missing` values in one of the grouping columns `cols` @@ -79,10 +73,17 @@ an `AbstractDict` can be used to index into a grouped data frame where the keys are column names of the data frame. The order of the keys does not matter in this case. +In the current implementation if `sort=false` groups are ordered following the +order of appearance of values in the grouping columns, except when all grouping +columns provide non-`nothing` `DataAPI.refpool` in which case the order of groups +follows the order of values returned by `DataAPI.refpool`. As a particular application +of this rule if all `cols` are `CategoricalVector`s then groups are always sorted. +Integer columns with a narrow range also use this this optimization, so to the +order of groups when grouping on integer columns is undefined. A column is considered to be an integer column when deciding on the grouping -algorithm choice if its `eltype` is a subtype of `Union{Missing, Real}`, all its -elements are either `missing` or pass `isinteger` test, and none of them is -equal to `-0.0`. +algorithm choice if its `eltype` is a subtype of `Union{Missing, Real}`, +all its elements are either `missing` or pass `isinteger` test, +and none of them is equal to `-0.0`. # See also From aee506a1d1832d573d322f49b618508657780646 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 28 Jun 2021 11:53:25 +0200 Subject: [PATCH 13/13] Update src/groupeddataframe/groupeddataframe.jl Co-authored-by: Milan Bouchet-Valat --- src/groupeddataframe/groupeddataframe.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index d703d340f7..14f6ff2487 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -75,7 +75,7 @@ not matter in this case. In the current implementation if `sort=false` groups are ordered following the order of appearance of values in the grouping columns, except when all grouping -columns provide non-`nothing` `DataAPI.refpool` in which case the order of groups +columns provide non-`nothing` `DataAPI.refpool`, in which case the order of groups follows the order of values returned by `DataAPI.refpool`. As a particular application of this rule if all `cols` are `CategoricalVector`s then groups are always sorted. Integer columns with a narrow range also use this this optimization, so to the