Skip to content

Commit

Permalink
Fix groupreduce with var and std for Unitful types (#2601)
Browse files Browse the repository at this point in the history
For `Number`s for which squaring changes the type, we need to use different
types for the mean, the variance and the standard deviation.
  • Loading branch information
nalimilan authored and bkamins committed Jan 21, 2021
1 parent a6910c5 commit 89b4b89
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 37 deletions.
10 changes: 6 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "DataFrames"
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
version = "0.22.2"
version = "0.22.3"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Expand All @@ -24,7 +24,6 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[compat]
julia = "1"
CategoricalArrays = "0.8.3, 0.9"
Compat = "3.17"
DataAPI = "1.4"
Expand All @@ -33,10 +32,12 @@ IteratorInterfaceExtensions = "0.1.1, 1"
Missings = "0.4.2"
PooledArrays = "0.5"
PrettyTables = "0.10"
Reexport = "0.1, 0.2"
Reexport = "0.1, 0.2, 1.0"
SortingAlgorithms = "0.1, 0.2, 0.3"
TableTraits = "0.4, 1"
Tables = "1.2"
Unitful = "1"
julia = "1"

[extras]
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Expand All @@ -45,6 +46,7 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"

[targets]
test = ["DataStructures", "DataValues", "Dates", "Logging", "Random", "Test"]
test = ["DataStructures", "DataValues", "Dates", "Logging", "Random", "Test", "Unitful"]
17 changes: 8 additions & 9 deletions src/groupeddataframe/fastaggregates.jl
Original file line number Diff line number Diff line change
Expand Up @@ -238,24 +238,23 @@ end

function (agg::Aggregate{typeof(var)})(incol::AbstractVector, gd::GroupedDataFrame)
means = groupreduce((x, i) -> x, Base.add_sum, agg.condf, /, false, incol, gd)
z = zero(eltype(incol)) - zero(eltype(means))
S = typeof((abs2(z) + abs2(z))/2)
# !ismissing check is purely an optimization to avoid a copy later
if eltype(means) >: Missing && agg.condf !== !ismissing
T = Union{Missing, real(eltype(means))}
else
T = real(eltype(means))
end
T = eltype(incol) >: Missing && agg.condf !== !ismissing ?
T = Union{Missing, S} : S
res = zeros(T, length(gd))
return groupreduce!(res, (x, i) -> @inbounds(abs2(x - means[i])), +, agg.condf,
(x, l) -> l <= 1 ? oftype(x / (l-1), NaN) : x / (l-1),
(x, l) -> l <= 1 ? x/0 : x/(l-1),
false, incol, gd)
end

function (agg::Aggregate{typeof(std)})(incol::AbstractVector, gd::GroupedDataFrame)
outcol = Aggregate(var, agg.condf)(incol, gd)
if eltype(outcol) <: Union{Missing, Rational}
return sqrt.(outcol)
else
if typeof(sqrt(zero(eltype(outcol)))) === eltype(outcol)
return map!(sqrt, outcol, outcol)
else
return map(sqrt, outcol)
end
end

Expand Down
118 changes: 94 additions & 24 deletions test/grouping.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module TestGrouping

using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays, DataAPI
using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays, DataAPI,
Unitful
const = isequal

"""Check if passed data frames are `isequal` and have the same element types of columns"""
Expand Down Expand Up @@ -904,8 +905,11 @@ Base.isless(::TestType, ::TestType) = false
@testset "combine with aggregation functions (skipmissing=$skip, sort=$sort, indices=$indices)" for
skip in (false, true), sort in (false, true), indices in (false, true)
Random.seed!(1)
df = DataFrame(a = rand([1:5;missing], 20), x1 = rand(1:100, 20),
x2 = rand(1:100, 20) +im*rand(1:100, 20))
# 5 is there to ensure we test a single-row group
df = DataFrame(a = [rand([1:4;missing], 19); 5],
x1 = rand(1:100, 20),
x2 = rand(1:100, 20) + im*rand(1:100, 20),
x3 = rand(1:100, 20) .* u"m")

for f in (sum, prod, maximum, minimum, mean, var, std, first, last, length)
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
Expand All @@ -918,39 +922,39 @@ Base.isless(::TestType, ::TestType) = false

for T in (Union{Missing, Int}, Union{Int, Int8},
Union{Missing, Int, Int8})
df.x3 = Vector{T}(df.x1)
df.x1u = Vector{T}(df.x1)
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
res = combine(gd, :x3 => f => :y)
expected = combine(gd, :x3 => (x -> f(x)) => :y)
res = combine(gd, :x1u => f => :y)
expected = combine(gd, :x1u => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
end

f === length && continue

df.x3 = allowmissing(df.x1)
df.x3[1] = missing
df.x1m = allowmissing(df.x1)
df.x1m[1] = missing
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
res = combine(gd, :x3 => f => :y)
expected = combine(gd, :x3 => (x -> f(x)) => :y)
res = combine(gd, :x1m => f => :y)
expected = combine(gd, :x1m => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
res = combine(gd, :x3 => fskipmissing => :y)
expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y)
res = combine(gd, :x1m => fskipmissing => :y)
expected = combine(gd, :x1m => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)

# Test reduction over group with only missing values
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
gd[1][:, :x3] .= missing
gd[1][:, :x1m] .= missing
if f in (maximum, minimum, first, last)
@test_throws ArgumentError combine(gd, :x3 => fskipmissing => :y)
@test_throws ArgumentError combine(gd, :x1m => fskipmissing => :y)
else
res = combine(gd, :x3 => fskipmissing => :y)
expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y)
res = combine(gd, :x1m => fskipmissing => :y)
expected = combine(gd, :x1m => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
end
Expand All @@ -964,29 +968,95 @@ Base.isless(::TestType, ::TestType) = false
expected = combine(gd, :x2 => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)

f === length && continue

df.x2m = allowmissing(df.x1)
df.x2m[1] = missing
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
res = combine(gd, :x2m => f => :y)
expected = combine(gd, :x2m => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
res = combine(gd, :x2m => fskipmissing => :y)
expected = combine(gd, :x2m => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)

# Test reduction over group with only missing values
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
gd[1][:, :x2m] .= missing
if f in (maximum, minimum, first, last)
@test_throws ArgumentError combine(gd, :x2m => fskipmissing => :y)
else
res = combine(gd, :x2m => fskipmissing => :y)
expected = combine(gd, :x2m => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
end
end
# Test Unitful numbers
for f in (sum, mean, minimum, maximum, var, std, first, last, length)
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices

res = combine(gd, :x3 => f => :y)
expected = combine(gd, :x3 => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)

f === length && continue

df.x3m = allowmissing(df.x1)
df.x3m[1] = missing
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
res = combine(gd, :x3m => f => :y)
expected = combine(gd, :x3m => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
res = combine(gd, :x3m => fskipmissing => :y)
expected = combine(gd, :x3m => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)

# Test reduction over group with only missing values
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
gd[1][:, :x3m] .= missing
if f in (maximum, minimum, first, last)
@test_throws ArgumentError combine(gd, :x3m => fskipmissing => :y)
else
res = combine(gd, :x3m => fskipmissing => :y)
expected = combine(gd, :x3m => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
end
end
# Test CategoricalArray
for f in (maximum, minimum, first, last, length),
(T, m) in ((Int, false),
(Union{Missing, Int}, false), (Union{Missing, Int}, true))
df.x3 = CategoricalVector{T}(df.x1)
m && (df.x3[1] = missing)
df.x1c = CategoricalVector{T}(df.x1)
m && (df.x1c[1] = missing)
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
res = combine(gd, :x3 => f => :y)
expected = combine(gd, :x3 => (x -> f(x)) => :y)
res = combine(gd, :x1c => f => :y)
expected = combine(gd, :x1c => (x -> f(x)) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)

f === length && continue

res = combine(gd, :x3 => fskipmissing => :y)
expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y)
res = combine(gd, :x1c => fskipmissing => :y)
expected = combine(gd, :x1c => (x -> f(collect(skipmissing(x)))) => :y)
@test res expected
@test typeof(res.y) == typeof(expected.y)
if m
gd[1][:, :x3] .= missing
@test_throws ArgumentError combine(gd, :x3 => fskipmissing => :y)
gd[1][:, :x1c] .= missing
@test_throws ArgumentError combine(gd, :x1c => fskipmissing => :y)
end
end
@test combine(gd, :x1 => maximum => :y, :x2 => sum => :z)
Expand Down

2 comments on commit 89b4b89

@bkamins
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register branch=0.22_patches

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/28432

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.22.3 -m "<description of version>" 89b4b89d8f8f4bd526a52645c523b23bf7367a19
git push origin v0.22.3

Please sign in to comment.