From 289fadfd752d22b6bec3d9450d46a95e25eda724 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 11 Jun 2021 19:50:59 +0200
Subject: [PATCH 01/13] add setindex! rules

---
 src/subdataframe/subdataframe.jl | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl
index 6b2b308aad..7001c27e3b 100644
--- a/src/subdataframe/subdataframe.jl
+++ b/src/subdataframe/subdataframe.jl
@@ -181,8 +181,16 @@ Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, id
     setindex!(sdf, val, idx[1], idx[2])
 end
 Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::Colon, colinds::Any)
-    parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)] = val
-    return sdf
+    if colinds isa SymbolOrString && getfield(sdf, :colindex) isa Index &&
+       && val isa AbstractVector && columnindex(sdf, colinds) == 0 && nrow(sdf) == length(val)
+        T = eltype(val)
+        newcol = Tables.allocatecolumn(Union{T, Missing}, n)
+        fill!(newcol, missing)
+        view(newcol, rows(sdf)) = val
+    else
+        parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)] = val
+    end
+return sdf
 end
 Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::typeof(!), colinds::Any)
     throw(ArgumentError("setting index of SubDataFrame using ! as row selector is not allowed"))

From c6bee1be273819e4ab46a8ed1372c27c574f7489 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 18 Jun 2021 14:01:27 +0200
Subject: [PATCH 02/13] fix float grouping bug

---
 NEWS.md                       | 2 ++
 src/groupeddataframe/utils.jl | 3 +--
 test/grouping.jl              | 4 ++++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 243e5da8c4..491b041f2a 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -14,6 +14,8 @@
 
 ## Bug fixes
 
+* fix bug in how `groupby` handles grouping of float columns
+  ([#XXXX](https://github.com/JuliaData/DataFrames.jl/pull/XXXX))
 * fix bug in how `issorted` handles custom orderings and improve performance
   of sorting when complex custom orderings are passed
   ([#2746](https://github.com/JuliaData/DataFrames.jl/pull/2746))
diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
index 52d1486bba..8e8585285f 100644
--- a/src/groupeddataframe/utils.jl
+++ b/src/groupeddataframe/utils.jl
@@ -143,8 +143,7 @@ function refpool_and_array(x::AbstractArray)
         else
             return nothing, nothing
         end
-    elseif x isa AbstractArray{<:Union{Real, Missing}} &&
-        all(v -> ismissing(v) | isinteger(v), x) &&
+    elseif x isa AbstractArray{<:Union{Integer, Missing}} &&
         !isempty(skipmissing(x))
         minval, maxval = extrema(skipmissing(x))
         ngroups = big(maxval) - big(minval) + 1
diff --git a/test/grouping.jl b/test/grouping.jl
index 95c3624098..00cc5d3687 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -3710,4 +3710,8 @@ end
     end
 end
 
+@testset "grouping floats" begin
+    @test length(groupby_checked(DataFrame(a=[0.0, -0.0]), :a)) == 2
+end
+
 end # module

From 9c9ed1153030d54afed2b17bb9ce0ca5df691a30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 18 Jun 2021 14:03:53 +0200
Subject: [PATCH 03/13] Revert "add setindex! rules"

This reverts commit 289fadfd752d22b6bec3d9450d46a95e25eda724.
---
 src/subdataframe/subdataframe.jl | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl
index 7001c27e3b..6b2b308aad 100644
--- a/src/subdataframe/subdataframe.jl
+++ b/src/subdataframe/subdataframe.jl
@@ -181,16 +181,8 @@ Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, id
     setindex!(sdf, val, idx[1], idx[2])
 end
 Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::Colon, colinds::Any)
-    if colinds isa SymbolOrString && getfield(sdf, :colindex) isa Index &&
-       && val isa AbstractVector && columnindex(sdf, colinds) == 0 && nrow(sdf) == length(val)
-        T = eltype(val)
-        newcol = Tables.allocatecolumn(Union{T, Missing}, n)
-        fill!(newcol, missing)
-        view(newcol, rows(sdf)) = val
-    else
-        parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)] = val
-    end
-return sdf
+    parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)] = val
+    return sdf
 end
 Base.@propagate_inbounds function Base.setindex!(sdf::SubDataFrame, val::Any, ::typeof(!), colinds::Any)
     throw(ArgumentError("setting index of SubDataFrame using ! as row selector is not allowed"))

From d57eff874eef15d044d41558c06aa3148447f3de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 18 Jun 2021 14:04:24 +0200
Subject: [PATCH 04/13] fix version

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 491b041f2a..815906bc02 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -15,7 +15,7 @@
 ## Bug fixes
 
 * fix bug in how `groupby` handles grouping of float columns
-  ([#XXXX](https://github.com/JuliaData/DataFrames.jl/pull/XXXX))
+  ([#2791](https://github.com/JuliaData/DataFrames.jl/pull/2791))
 * fix bug in how `issorted` handles custom orderings and improve performance
   of sorting when complex custom orderings are passed
   ([#2746](https://github.com/JuliaData/DataFrames.jl/pull/2746))

From 3641c990cf36768cbd8efdeda85234b9e2006485 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 18 Jun 2021 16:07:09 +0200
Subject: [PATCH 05/13] fix tests

---
 test/grouping.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/grouping.jl b/test/grouping.jl
index 5b29d5366e..ee6055f406 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -751,8 +751,8 @@ end
 
     # Check optimized approach based on refpool method
     for sm in (false, true),
-        S in (Int, Float64),
-        T in (Int, Float64),
+        S in (Int, BigInt),
+        T in (Int, BigInt),
         df in (DataFrame(x=rand(1:10, 1000),
                          y=rand(-3:10, 1000), z=rand(1000)),
                DataFrame(x=rand([1:10; missing], 1000),
@@ -792,8 +792,8 @@ end
 
     # Check fallback to hash table method when range is too wide
     for sm in (false, true),
-        S in (Int, Float64),
-        T in (Int, Float64),
+        S in (Int, Int32),
+        T in (Int, Int32),
         df in (DataFrame(x=rand(1:100_000, 100),
                          y=rand(-50:110_000, 100), z=rand(100)),
                DataFrame(x=rand([1:100_000; missing], 100),

From 9b7e9a6cadb64af4f039987e293e9bc96be97550 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 19 Jun 2021 22:33:01 +0200
Subject: [PATCH 06/13] allow fast grouping of floats

---
 src/groupeddataframe/utils.jl | 25 ++++++++++++++++++++++---
 test/grouping.jl              | 12 ++++++++----
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
index 8e8585285f..f59e79e515 100644
--- a/src/groupeddataframe/utils.jl
+++ b/src/groupeddataframe/utils.jl
@@ -143,9 +143,28 @@ function refpool_and_array(x::AbstractArray)
         else
             return nothing, nothing
         end
-    elseif x isa AbstractArray{<:Union{Integer, Missing}} &&
-        !isempty(skipmissing(x))
-        minval, maxval = extrema(skipmissing(x))
+    elseif x isa AbstractArray{<:Union{Real, Missing}} && length(x) > 0
+        if !(x isa AbstractArray{<:Union{Integer, Missing}})
+            if !all(v -> (ismissing(v) | isinteger(v)) & (!isequal(v, -0.0)), x)
+                return nothing, nothing
+            end
+        end
+        if Missing <: eltype(x)
+            smx = skipmissing(x)
+            y = iterate(smx)
+            y === nothing && return nothing, nothing
+            (v, s) = y
+            minval, maxval = v
+            while true
+                y = iterate(smx, s)
+                y === nothing && break
+                (v, s) = y
+                maxval = max(v, maxval)
+                minval = min(v, minval)
+            end
+        else
+            minval, maxval = extrema(x)
+        end
         ngroups = big(maxval) - big(minval) + 1
         # Threshold chosen with the same rationale as the row_group_slots refpool method:
         # refpool approach is faster but we should not allocate too much memory either
diff --git a/test/grouping.jl b/test/grouping.jl
index ee6055f406..2c6a93041a 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -751,8 +751,8 @@ end
 
     # Check optimized approach based on refpool method
     for sm in (false, true),
-        S in (Int, BigInt),
-        T in (Int, BigInt),
+        S in (Int, Float64),
+        T in (Int, Float64),
         df in (DataFrame(x=rand(1:10, 1000),
                          y=rand(-3:10, 1000), z=rand(1000)),
                DataFrame(x=rand([1:10; missing], 1000),
@@ -792,8 +792,8 @@ end
 
     # Check fallback to hash table method when range is too wide
     for sm in (false, true),
-        S in (Int, Int32),
-        T in (Int, Int32),
+        S in (Int, Float64),
+        T in (Int, Float64),
         df in (DataFrame(x=rand(1:100_000, 100),
                          y=rand(-50:110_000, 100), z=rand(100)),
                DataFrame(x=rand([1:100_000; missing], 100),
@@ -3712,6 +3712,10 @@ end
 
 @testset "grouping floats" begin
     @test length(groupby_checked(DataFrame(a=[0.0, -0.0]), :a)) == 2
+    @test getindex.(keys(groupby_checked(DataFrame(a=[3.0, 2.0, 0.0]), :a)), 1) ==
+          [0, 2, 3]
+    @test getindex.(keys(groupby_checked(DataFrame(a=[3.0, 2.0, -0.0]), :a)), 1) ==
+          [3, 2, 0]
 end
 
 @testset "aggregation with matrix of Pair" begin

From df6f3cd5f8b6f5bea510ac0d4ed04beaf4a3864c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 20 Jun 2021 09:55:19 +0200
Subject: [PATCH 07/13] fix typo

---
 src/groupeddataframe/utils.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
index f59e79e515..e19422a9dd 100644
--- a/src/groupeddataframe/utils.jl
+++ b/src/groupeddataframe/utils.jl
@@ -154,7 +154,7 @@ function refpool_and_array(x::AbstractArray)
             y = iterate(smx)
             y === nothing && return nothing, nothing
             (v, s) = y
-            minval, maxval = v
+            minval = maxval = v
             while true
                 y = iterate(smx, s)
                 y === nothing && break

From 96febcf7008cb4885bf15ee960555f79b52e45cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 26 Jun 2021 20:47:02 +0200
Subject: [PATCH 08/13] Update src/groupeddataframe/utils.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/groupeddataframe/utils.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
index e19422a9dd..da2335cad2 100644
--- a/src/groupeddataframe/utils.jl
+++ b/src/groupeddataframe/utils.jl
@@ -145,7 +145,7 @@ function refpool_and_array(x::AbstractArray)
         end
     elseif x isa AbstractArray{<:Union{Real, Missing}} && length(x) > 0
         if !(x isa AbstractArray{<:Union{Integer, Missing}})
-            if !all(v -> (ismissing(v) | isinteger(v)) & (!isequal(v, -0.0)), x)
+            if !all(v -> (ismissing(v) | isinteger(v)) & !isequal(v, -0.0), x)
                 return nothing, nothing
             end
         end

From b3ab07dd2e9c422e78d69331d0a84acdde7c0864 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 26 Jun 2021 20:47:16 +0200
Subject: [PATCH 09/13] update news

---
 NEWS.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 5da977307c..9cecc65eed 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -14,7 +14,9 @@
 
 ## Bug fixes
 
-* fix bug in how `groupby` handles grouping of float columns
+* fix bug in how `groupby` handles grouping of float columns;
+  now `-0.0` is treated as *not integer* when deciding on which
+  grouping algorithm should be used
   ([#2791](https://github.com/JuliaData/DataFrames.jl/pull/2791))
 * fix bug in how `issorted` handles custom orderings and improve performance
   of sorting when complex custom orderings are passed

From c5173f1e32590c9a1b3bfa6bb4d3c99cab72f421 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 26 Jun 2021 20:56:31 +0200
Subject: [PATCH 10/13] code cleanup after the review

---
 src/groupeddataframe/groupeddataframe.jl |  5 +++++
 src/groupeddataframe/utils.jl            | 14 ++++----------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
index df8502209a..a0036ea3e7 100644
--- a/src/groupeddataframe/groupeddataframe.jl
+++ b/src/groupeddataframe/groupeddataframe.jl
@@ -79,6 +79,11 @@ an `AbstractDict` can be used to index into a grouped data frame where
 the keys are column names of the data frame. The order of the keys does
 not matter in this case.
 
+A column is considered to be an integer column when deciding on the grouping
+algorithm choice if its `eltype` is a subtype of `Union{Missing, Real}`, all its
+elements are either `missing` or pass `isinteger` test, and none of them is
+equal to `-0.0`.
+
 # See also
 
 [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref), [`transform!`](@ref)
diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
index da2335cad2..a750d14cf0 100644
--- a/src/groupeddataframe/utils.jl
+++ b/src/groupeddataframe/utils.jl
@@ -151,16 +151,10 @@ function refpool_and_array(x::AbstractArray)
         end
         if Missing <: eltype(x)
             smx = skipmissing(x)
-            y = iterate(smx)
-            y === nothing && return nothing, nothing
-            (v, s) = y
-            minval = maxval = v
-            while true
-                y = iterate(smx, s)
-                y === nothing && break
-                (v, s) = y
-                maxval = max(v, maxval)
-                minval = min(v, minval)
+            if isempty(smx)
+                return nothing, nothing
+            else
+                minval, maxval = extrema(smx)
             end
         else
             minval, maxval = extrema(x)

From bdd4b1c87715afe8882e92f7287a23aa87977a0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 27 Jun 2021 22:27:43 +0200
Subject: [PATCH 11/13] Update src/groupeddataframe/utils.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/groupeddataframe/utils.jl | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
index a750d14cf0..40963e904b 100644
--- a/src/groupeddataframe/utils.jl
+++ b/src/groupeddataframe/utils.jl
@@ -143,12 +143,9 @@ function refpool_and_array(x::AbstractArray)
         else
             return nothing, nothing
         end
-    elseif x isa AbstractArray{<:Union{Real, Missing}} && length(x) > 0
-        if !(x isa AbstractArray{<:Union{Integer, Missing}})
-            if !all(v -> (ismissing(v) | isinteger(v)) & !isequal(v, -0.0), x)
-                return nothing, nothing
-            end
-        end
+    elseif x isa AbstractArray{<:Union{Real, Missing}} && length(x) > 0 &&
+        (x isa AbstractArray{<:Union{Integer, Missing}} ||
+         all(v -> (ismissing(v) | isinteger(v)) & !isequal(v, -0.0), x))
         if Missing <: eltype(x)
             smx = skipmissing(x)
             if isempty(smx)

From 2b7e9f681b84a43bc4629122d7758c11cf7fe7cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 27 Jun 2021 22:33:48 +0200
Subject: [PATCH 12/13] improve docstring

---
 src/groupeddataframe/groupeddataframe.jl | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
index a0036ea3e7..d703d340f7 100644
--- a/src/groupeddataframe/groupeddataframe.jl
+++ b/src/groupeddataframe/groupeddataframe.jl
@@ -45,14 +45,8 @@ into row groups.
   ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
 - `sort` : whether to sort groups according to the values of the grouping columns
   `cols`; if `sort=false` then the order of groups in the result is undefined
-  and may change in future releases. In the current implementation
-  groups are ordered following the order of appearance of values in the grouping
-  columns, except when all grouping columns provide non-`nothing`
-  `DataAPI.refpool` in which case the order of groups follows the order of
-  values returned by `DataAPI.refpool`. As a particular application of this rule
-  if all `cols` are `CategoricalVector`s then groups are always sorted
-  Integer columns with a narrow range also use this this optimization, so
-  to the order of groups when grouping on integer columns is undefined.
+  and may change in future releases; below a description of the current
+  implementation is provided.
 - `skipmissing` : whether to skip groups with `missing` values in one of the
   grouping columns `cols`
 
@@ -79,10 +73,17 @@ an `AbstractDict` can be used to index into a grouped data frame where
 the keys are column names of the data frame. The order of the keys does
 not matter in this case.
 
+In the current implementation if `sort=false` groups are ordered following the
+order of appearance of values in the grouping columns, except when all grouping
+columns provide non-`nothing` `DataAPI.refpool` in which case the order of groups
+follows the order of values returned by `DataAPI.refpool`. As a particular application
+of this rule if all `cols` are `CategoricalVector`s then groups are always sorted.
+Integer columns with a narrow range also use this this optimization, so to the
+order of groups when grouping on integer columns is undefined.
 A column is considered to be an integer column when deciding on the grouping
-algorithm choice if its `eltype` is a subtype of `Union{Missing, Real}`, all its
-elements are either `missing` or pass `isinteger` test, and none of them is
-equal to `-0.0`.
+algorithm choice if its `eltype` is a subtype of `Union{Missing, Real}`,
+all its elements are either `missing` or pass `isinteger` test,
+and none of them is equal to `-0.0`.
 
 # See also
 

From aee506a1d1832d573d322f49b618508657780646 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 28 Jun 2021 11:53:25 +0200
Subject: [PATCH 13/13] Update src/groupeddataframe/groupeddataframe.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/groupeddataframe/groupeddataframe.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
index d703d340f7..14f6ff2487 100644
--- a/src/groupeddataframe/groupeddataframe.jl
+++ b/src/groupeddataframe/groupeddataframe.jl
@@ -75,7 +75,7 @@ not matter in this case.
 
 In the current implementation if `sort=false` groups are ordered following the
 order of appearance of values in the grouping columns, except when all grouping
-columns provide non-`nothing` `DataAPI.refpool` in which case the order of groups
+columns provide non-`nothing` `DataAPI.refpool`, in which case the order of groups
 follows the order of values returned by `DataAPI.refpool`. As a particular application
 of this rule if all `cols` are `CategoricalVector`s then groups are always sorted.
 Integer columns with a narrow range also use this this optimization, so to the