Fix groupreduce with var and std for Unitful types (#2601)

For `Number`s for which squaring changes the type, we need to use different types for the mean, the variance and the standard deviation.
JuliaData · Jan 21, 2021 · 34e53e0 · 34e53e0
1 parent cf4f3ab
commit 34e53e0
Show file tree

Hide file tree

Showing 3 changed files with 105 additions and 35 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "DataFrames"
 uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-version = "0.22.1"
+version = "0.22.3"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -36,6 +36,7 @@ Reexport = "0.1, 0.2, 1.0"
 SortingAlgorithms = "0.1, 0.2, 0.3"
 TableTraits = "0.4, 1"
 Tables = "1.2"
+Unitful = "1"
 julia = "1"
 
 [extras]
@@ -46,6 +47,7 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
 
 [targets]
-test = ["Combinatorics", "DataStructures", "DataValues", "Dates", "Logging", "Random", "Test"]
+test = ["Combinatorics", "DataStructures", "DataValues", "Dates", "Logging", "Random", "Test", "Unitful"]
diff --git a/src/groupeddataframe/fastaggregates.jl b/src/groupeddataframe/fastaggregates.jl
@@ -251,24 +251,23 @@ end
 
 function (agg::Aggregate{typeof(var)})(incol::AbstractVector, gd::GroupedDataFrame)
     means = groupreduce((x, i) -> x, Base.add_sum, agg.condf, /, false, incol, gd)
+    z = zero(eltype(incol)) - zero(eltype(means))
+    S = typeof((abs2(z) + abs2(z))/2)
     # !ismissing check is purely an optimization to avoid a copy later
-    if eltype(means) >: Missing && agg.condf !== !ismissing
-        T = Union{Missing, real(eltype(means))}
-    else
-        T = real(eltype(means))
-    end
+    T = eltype(incol) >: Missing && agg.condf !== !ismissing ?
+        T = Union{Missing, S} : S
     res = zeros(T, length(gd))
     return groupreduce!(res, (x, i) -> @inbounds(abs2(x - means[i])), +, agg.condf,
-                        (x, l) -> l <= 1 ? oftype(x / (l-1), NaN) : x / (l-1),
+                        (x, l) -> l <= 1 ? x/0 : x/(l-1),
                         false, incol, gd)
 end
 
 function (agg::Aggregate{typeof(std)})(incol::AbstractVector, gd::GroupedDataFrame)
     outcol = Aggregate(var, agg.condf)(incol, gd)
-    if eltype(outcol) <: Union{Missing, Rational}
-        return sqrt.(outcol)
-    else
+    if typeof(sqrt(zero(eltype(outcol)))) === eltype(outcol)
         return map!(sqrt, outcol, outcol)
+    else
+        return map(sqrt, outcol)
     end
 end
 

diff --git a/test/grouping.jl b/test/grouping.jl
@@ -1,7 +1,7 @@
 module TestGrouping
 
 using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays, DataAPI,
-    Combinatorics
+    Combinatorics, Unitful
 const ≅ = isequal
 
 """Check if passed data frames are `isequal` and have the same element types of columns"""
@@ -905,8 +905,11 @@ Base.isless(::TestType, ::TestType) = false
 @testset "combine with aggregation functions (skipmissing=$skip, sort=$sort, indices=$indices)" for
     skip in (false, true), sort in (false, true), indices in (false, true)
     Random.seed!(1)
-    df = DataFrame(a = rand([1:5;missing], 20), x1 = rand(1:100, 20),
-                   x2 = rand(1:100, 20) +im*rand(1:100, 20))
+    # 5 is there to ensure we test a single-row group
+    df = DataFrame(a = [rand([1:4;missing], 19); 5],
+                   x1 = rand(1:100, 20),
+                   x2 = rand(1:100, 20) + im*rand(1:100, 20),
+                   x3 = rand(1:100, 20) .* u"m")
 
     for f in (sum, prod, maximum, minimum, mean, var, std, first, last, length)
         gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
@@ -919,39 +922,39 @@ Base.isless(::TestType, ::TestType) = false
 
         for T in (Union{Missing, Int}, Union{Int, Int8},
                   Union{Missing, Int, Int8})
-            df.x3 = Vector{T}(df.x1)
+            df.x1u = Vector{T}(df.x1)
             gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
             indices && @test gd.idx !== nothing # Trigger computation of indices
-            res = combine(gd, :x3 => f => :y)
-            expected = combine(gd, :x3 => (x -> f(x)) => :y)
+            res = combine(gd, :x1u => f => :y)
+            expected = combine(gd, :x1u => (x -> f(x)) => :y)
             @test res ≅ expected
             @test typeof(res.y) == typeof(expected.y)
         end
 
         f === length && continue
 
-        df.x3 = allowmissing(df.x1)
-        df.x3[1] = missing
+        df.x1m = allowmissing(df.x1)
+        df.x1m[1] = missing
         gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
         indices && @test gd.idx !== nothing # Trigger computation of indices
-        res = combine(gd, :x3 => f => :y)
-        expected = combine(gd, :x3 => (x -> f(x)) => :y)
+        res = combine(gd, :x1m => f => :y)
+        expected = combine(gd, :x1m => (x -> f(x)) => :y)
         @test res ≅ expected
         @test typeof(res.y) == typeof(expected.y)
-        res = combine(gd, :x3 => f∘skipmissing => :y)
-        expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y)
+        res = combine(gd, :x1m => f∘skipmissing => :y)
+        expected = combine(gd, :x1m => (x -> f(collect(skipmissing(x)))) => :y)
         @test res ≅ expected
         @test typeof(res.y) == typeof(expected.y)
 
         # Test reduction over group with only missing values
         gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
         indices && @test gd.idx !== nothing # Trigger computation of indices
-        gd[1][:, :x3] .= missing
+        gd[1][:, :x1m] .= missing
         if f in (maximum, minimum, first, last)
-            @test_throws ArgumentError combine(gd, :x3 => f∘skipmissing => :y)
+            @test_throws ArgumentError combine(gd, :x1m => f∘skipmissing => :y)
         else
-            res = combine(gd, :x3 => f∘skipmissing => :y)
-            expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y)
+            res = combine(gd, :x1m => f∘skipmissing => :y)
+            expected = combine(gd, :x1m => (x -> f(collect(skipmissing(x)))) => :y)
             @test res ≅ expected
             @test typeof(res.y) == typeof(expected.y)
         end
@@ -965,29 +968,95 @@ Base.isless(::TestType, ::TestType) = false
         expected = combine(gd, :x2 => (x -> f(x)) => :y)
         @test res ≅ expected
         @test typeof(res.y) == typeof(expected.y)
+
+        f === length && continue
+
+        df.x2m = allowmissing(df.x1)
+        df.x2m[1] = missing
+        gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
+        indices && @test gd.idx !== nothing # Trigger computation of indices
+        res = combine(gd, :x2m => f => :y)
+        expected = combine(gd, :x2m => (x -> f(x)) => :y)
+        @test res ≅ expected
+        @test typeof(res.y) == typeof(expected.y)
+        res = combine(gd, :x2m => f∘skipmissing => :y)
+        expected = combine(gd, :x2m => (x -> f(collect(skipmissing(x)))) => :y)
+        @test res ≅ expected
+        @test typeof(res.y) == typeof(expected.y)
+
+        # Test reduction over group with only missing values
+        gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
+        indices && @test gd.idx !== nothing # Trigger computation of indices
+        gd[1][:, :x2m] .= missing
+        if f in (maximum, minimum, first, last)
+            @test_throws ArgumentError combine(gd, :x2m => f∘skipmissing => :y)
+        else
+            res = combine(gd, :x2m => f∘skipmissing => :y)
+            expected = combine(gd, :x2m => (x -> f(collect(skipmissing(x)))) => :y)
+            @test res ≅ expected
+            @test typeof(res.y) == typeof(expected.y)
+        end
+    end
+    # Test Unitful numbers
+    for f in (sum, mean, minimum, maximum, var, std, first, last, length)
+        gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
+        indices && @test gd.idx !== nothing # Trigger computation of indices
+
+        res = combine(gd, :x3 => f => :y)
+        expected = combine(gd, :x3 => (x -> f(x)) => :y)
+        @test res ≅ expected
+        @test typeof(res.y) == typeof(expected.y)
+
+        f === length && continue
+
+        df.x3m = allowmissing(df.x1)
+        df.x3m[1] = missing
+        gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
+        indices && @test gd.idx !== nothing # Trigger computation of indices
+        res = combine(gd, :x3m => f => :y)
+        expected = combine(gd, :x3m => (x -> f(x)) => :y)
+        @test res ≅ expected
+        @test typeof(res.y) == typeof(expected.y)
+        res = combine(gd, :x3m => f∘skipmissing => :y)
+        expected = combine(gd, :x3m => (x -> f(collect(skipmissing(x)))) => :y)
+        @test res ≅ expected
+        @test typeof(res.y) == typeof(expected.y)
+
+        # Test reduction over group with only missing values
+        gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
+        indices && @test gd.idx !== nothing # Trigger computation of indices
+        gd[1][:, :x3m] .= missing
+        if f in (maximum, minimum, first, last)
+            @test_throws ArgumentError combine(gd, :x3m => f∘skipmissing => :y)
+        else
+            res = combine(gd, :x3m => f∘skipmissing => :y)
+            expected = combine(gd, :x3m => (x -> f(collect(skipmissing(x)))) => :y)
+            @test res ≅ expected
+            @test typeof(res.y) == typeof(expected.y)
+        end
     end
     # Test CategoricalArray
     for f in (maximum, minimum, first, last, length),
         (T, m) in ((Int, false),
                    (Union{Missing, Int}, false), (Union{Missing, Int}, true))
-        df.x3 = CategoricalVector{T}(df.x1)
-        m && (df.x3[1] = missing)
+        df.x1c = CategoricalVector{T}(df.x1)
+        m && (df.x1c[1] = missing)
         gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
         indices && @test gd.idx !== nothing # Trigger computation of indices
-        res = combine(gd, :x3 => f => :y)
-        expected = combine(gd, :x3 => (x -> f(x)) => :y)
+        res = combine(gd, :x1c => f => :y)
+        expected = combine(gd, :x1c => (x -> f(x)) => :y)
         @test res ≅ expected
         @test typeof(res.y) == typeof(expected.y)
 
         f === length && continue
 
-        res = combine(gd, :x3 => f∘skipmissing => :y)
-        expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y)
+        res = combine(gd, :x1c => f∘skipmissing => :y)
+        expected = combine(gd, :x1c => (x -> f(collect(skipmissing(x)))) => :y)
         @test res ≅ expected
         @test typeof(res.y) == typeof(expected.y)
         if m
-            gd[1][:, :x3] .= missing
-            @test_throws ArgumentError combine(gd, :x3 => f∘skipmissing => :y)
+            gd[1][:, :x1c] .= missing
+            @test_throws ArgumentError combine(gd, :x1c => f∘skipmissing => :y)
         end
     end
     @test combine(gd, :x1 => maximum => :y, :x2 => sum => :z) ≅