Skip to content

Commit

Permalink
Fix groupreduce with var and std for Unitful types (#2601)
Browse files Browse the repository at this point in the history
For `Number`s for which squaring changes the type, we need to use different
types for the mean, the variance and the standard deviation.
  • Loading branch information
nalimilan authored Jan 21, 2021
1 parent cf4f3ab commit 34e53e0
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 35 deletions.
6 changes: 4 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "DataFrames"
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
version = "0.22.1"
version = "0.22.3"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Expand Down Expand Up @@ -36,6 +36,7 @@ Reexport = "0.1, 0.2, 1.0"
SortingAlgorithms = "0.1, 0.2, 0.3"
TableTraits = "0.4, 1"
Tables = "1.2"
Unitful = "1"
julia = "1"

[extras]
Expand All @@ -46,6 +47,7 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"

[targets]
test = ["Combinatorics", "DataStructures", "DataValues", "Dates", "Logging", "Random", "Test"]
test = ["Combinatorics", "DataStructures", "DataValues", "Dates", "Logging", "Random", "Test", "Unitful"]
17 changes: 8 additions & 9 deletions src/groupeddataframe/fastaggregates.jl
Original file line number Diff line number Diff line change
Expand Up @@ -251,24 +251,23 @@ end

function (agg::Aggregate{typeof(var)})(incol::AbstractVector, gd::GroupedDataFrame)
means = groupreduce((x, i) -> x, Base.add_sum, agg.condf, /, false, incol, gd)
z = zero(eltype(incol)) - zero(eltype(means))
S = typeof((abs2(z) + abs2(z))/2)
# !ismissing check is purely an optimization to avoid a copy later
if eltype(means) >: Missing && agg.condf !== !ismissing
T = Union{Missing, real(eltype(means))}
else
T = real(eltype(means))
end
T = eltype(incol) >: Missing && agg.condf !== !ismissing ?
T = Union{Missing, S} : S
res = zeros(T, length(gd))
return groupreduce!(res, (x, i) -> @inbounds(abs2(x - means[i])), +, agg.condf,
(x, l) -> l <= 1 ? oftype(x / (l-1), NaN) : x / (l-1),
(x, l) -> l <= 1 ? x/0 : x/(l-1),
false, incol, gd)
end

function (agg::Aggregate{typeof(std)})(incol::AbstractVector, gd::GroupedDataFrame)
outcol = Aggregate(var, agg.condf)(incol, gd)
if eltype(outcol) <: Union{Missing, Rational}
return sqrt.(outcol)
else
if typeof(sqrt(zero(eltype(outcol)))) === eltype(outcol)
return map!(sqrt, outcol, outcol)
else
return map(sqrt, outcol)
end
end

Expand Down
117 changes: 93 additions & 24 deletions test/grouping.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module TestGrouping

using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays, DataAPI,
Combinatorics
Combinatorics, Unitful
const = isequal

"""Check if passed data frames are `isequal` and have the same element types of columns"""
Expand Down Expand Up @@ -905,8 +905,11 @@ Base.isless(::TestType, ::TestType) = false
@testset "combine with aggregation functions (skipmissing=$skip, sort=$sort, indices=$indices)" for
skip in (false, true), sort in (false, true), indices in (false, true)
Random.seed!(1)
df = DataFrame(a = rand([1:5;missing], 20), x1 = rand(1:100, 20),
x2 = rand(1:100, 20) +im*rand(1:100, 20))
# 5 is there to ensure we test a single-row group
df = DataFrame(a = [rand([1:4;missing], 19); 5],
x1 = rand(1:100, 20),
x2 = rand(1:100, 20) + im*rand(1:100, 20),
x3 = rand(1:100, 20) .* u"m")

for f in (sum, prod, maximum, minimum, mean, var, std, first, last, length)
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
Expand All @@ -919,39 +922,39 @@ Base.isless(::TestType, ::TestType) = false

for T in (Union{Missing, Int}, Union{Int, Int8},
Union{Missing, Int, Int8})
df.x3 = Vector{T}(df.x1)
df.x1u = Vector{T}(df.x1)
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
res = combine(gd, :x3 => f => :y)
expected = combine(gd, :x3 => (x -> f(x)) => :y)
res = combine(gd, :x1u => f => :y)
expected = combine(gd, :x1u => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
end

f === length && continue

df.x3 = allowmissing(df.x1)
df.x3[1] = missing
df.x1m = allowmissing(df.x1)
df.x1m[1] = missing
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
res = combine(gd, :x3 => f => :y)
expected = combine(gd, :x3 => (x -> f(x)) => :y)
res = combine(gd, :x1m => f => :y)
expected = combine(gd, :x1m => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
res = combine(gd, :x3 => fskipmissing => :y)
expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y)
res = combine(gd, :x1m => fskipmissing => :y)
expected = combine(gd, :x1m => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)

# Test reduction over group with only missing values
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
gd[1][:, :x3] .= missing
gd[1][:, :x1m] .= missing
if f in (maximum, minimum, first, last)
@test_throws ArgumentError combine(gd, :x3 => fskipmissing => :y)
@test_throws ArgumentError combine(gd, :x1m => fskipmissing => :y)
else
res = combine(gd, :x3 => fskipmissing => :y)
expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y)
res = combine(gd, :x1m => fskipmissing => :y)
expected = combine(gd, :x1m => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
end
Expand All @@ -965,29 +968,95 @@ Base.isless(::TestType, ::TestType) = false
expected = combine(gd, :x2 => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)

f === length && continue

df.x2m = allowmissing(df.x1)
df.x2m[1] = missing
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
res = combine(gd, :x2m => f => :y)
expected = combine(gd, :x2m => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
res = combine(gd, :x2m => fskipmissing => :y)
expected = combine(gd, :x2m => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)

# Test reduction over group with only missing values
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
gd[1][:, :x2m] .= missing
if f in (maximum, minimum, first, last)
@test_throws ArgumentError combine(gd, :x2m => fskipmissing => :y)
else
res = combine(gd, :x2m => fskipmissing => :y)
expected = combine(gd, :x2m => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
end
end
# Test Unitful numbers
for f in (sum, mean, minimum, maximum, var, std, first, last, length)
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices

res = combine(gd, :x3 => f => :y)
expected = combine(gd, :x3 => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)

f === length && continue

df.x3m = allowmissing(df.x1)
df.x3m[1] = missing
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
res = combine(gd, :x3m => f => :y)
expected = combine(gd, :x3m => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
res = combine(gd, :x3m => fskipmissing => :y)
expected = combine(gd, :x3m => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)

# Test reduction over group with only missing values
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
gd[1][:, :x3m] .= missing
if f in (maximum, minimum, first, last)
@test_throws ArgumentError combine(gd, :x3m => fskipmissing => :y)
else
res = combine(gd, :x3m => fskipmissing => :y)
expected = combine(gd, :x3m => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
end
end
# Test CategoricalArray
for f in (maximum, minimum, first, last, length),
(T, m) in ((Int, false),
(Union{Missing, Int}, false), (Union{Missing, Int}, true))
df.x3 = CategoricalVector{T}(df.x1)
m && (df.x3[1] = missing)
df.x1c = CategoricalVector{T}(df.x1)
m && (df.x1c[1] = missing)
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
res = combine(gd, :x3 => f => :y)
expected = combine(gd, :x3 => (x -> f(x)) => :y)
res = combine(gd, :x1c => f => :y)
expected = combine(gd, :x1c => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)

f === length && continue

res = combine(gd, :x3 => fskipmissing => :y)
expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y)
res = combine(gd, :x1c => fskipmissing => :y)
expected = combine(gd, :x1c => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
if m
gd[1][:, :x3] .= missing
@test_throws ArgumentError combine(gd, :x3 => fskipmissing => :y)
gd[1][:, :x1c] .= missing
@test_throws ArgumentError combine(gd, :x1c => fskipmissing => :y)
end
end
@test combine(gd, :x1 => maximum => :y, :x2 => sum => :z)
Expand Down

0 comments on commit 34e53e0

Please sign in to comment.