From 641eaa37adb2527947bf3249d8c92426a6f12e87 Mon Sep 17 00:00:00 2001 From: Alec Loudenback Date: Thu, 16 Mar 2023 21:36:56 -0500 Subject: [PATCH 1/4] add `:sum` to `describe` --- src/abstractdataframe/abstractdataframe.jl | 10 +++++++--- test/dataframe.jl | 3 ++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 7b7779e81..64e2b7bdc 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -577,7 +577,7 @@ where each row represents a variable and each column a summary statistic. - `stats::Union{Symbol, Pair}...` : the summary statistics to report. Arguments can be: - A symbol from the list `:mean`, `:std`, `:min`, `:q25`, - `:median`, `:q75`, `:max`, `:eltype`, `:nunique`, `:nuniqueall`, `:first`, + `:median`, `:q75`, `:max`, `:sum`, `:eltype`, `:nunique`, `:nuniqueall`, `:first`, `:last`, `:nnonmissing`, and `:nmissing`. The default statistics used are `:mean`, `:min`, `:median`, `:max`, `:nmissing`, and `:eltype`. - `:detailed` as the only `Symbol` argument to return all statistics @@ -664,7 +664,7 @@ DataAPI.describe(df::AbstractDataFrame; cols=:) = function _describe(df::AbstractDataFrame, stats::AbstractVector) predefined_funs = Symbol[s for s in stats if s isa Symbol] - allowed_fields = [:mean, :std, :min, :q25, :median, :q75, :max, + allowed_fields = [:mean, :std, :min, :q25, :median, :q75, :max, :sum, :nunique, :nuniqueall, :nmissing, :nnonmissing, :first, :last, :eltype] @@ -674,7 +674,7 @@ function _describe(df::AbstractDataFrame, stats::AbstractVector) splice!(stats, i, allowed_fields) # insert in the stats vector to get a good order elseif predefined_funs == [:detailed] predefined_funs = [:mean, :std, :min, :q25, :median, :q75, - :max, :nunique, :nmissing, :eltype] + :max, :sum, :nunique, :nmissing, :eltype] i = findfirst(s -> s == :detailed, stats) splice!(stats, i, predefined_funs) # insert in the stats vector to get a good order elseif :all in predefined_funs || :detailed in predefined_funs @@ -799,6 +799,10 @@ function get_stats(@nospecialize(col::Union{AbstractVector, Base.SkipMissing}), d[:nuniqueall] = try length(Set(col)) catch end end + if :sum in stats + d[:sum] = try sum(col) catch end + end + return d end diff --git a/test/dataframe.jl b/test/dataframe.jl index d9cce7682..5677834d8 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -548,6 +548,7 @@ end median=[2.5, 2.0, nothing, nothing, VERSION >= v"1.7.0-beta1.2" ? Date(2002) : nothing, nothing], q75=[3.25, 2.5, nothing, nothing, nothing, nothing], max=[4.0, 3.0, "d", "c", Date(2004), 2], + sum=[10, 6, nothing, nothing, nothing, nothing], nunique=[nothing, nothing, 4, 3, 4, 2], nuniqueall=[4, 3, 4, 3, 4, 2], nmissing=[0, 1, 0, 1, 0, 0], @@ -570,7 +571,7 @@ end # Test that it works with :detailed @test describe_output[:, [:variable, :mean, :std, :min, :q25, :median, :q75, - :max, :nunique, :nmissing, :eltype]] ≅ + :max, :sum, :nunique, :nmissing, :eltype]] ≅ describe(df, :detailed) # Test that it works on a custom function From 0d2692e962e40069a042d6b0d6c66da4fa52e261 Mon Sep 17 00:00:00 2001 From: Alec Loudenback Date: Thu, 27 Apr 2023 14:55:10 -0500 Subject: [PATCH 2/4] address comments --- src/abstractdataframe/abstractdataframe.jl | 4 ++-- test/dataframe.jl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 64e2b7bdc..6b544e602 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -581,7 +581,7 @@ where each row represents a variable and each column a summary statistic. `:last`, `:nnonmissing`, and `:nmissing`. The default statistics used are `:mean`, `:min`, `:median`, `:max`, `:nmissing`, and `:eltype`. - `:detailed` as the only `Symbol` argument to return all statistics - except `:first`, `:last`, `:nuniqueall`, and `:nnonmissing`. + except `:first`, `:last`, `:sum`, `:nuniqueall`, and `:nnonmissing`. - `:all` as the only `Symbol` argument to return all statistics. - A `function => name` pair where `name` is a `Symbol` or string. This will create a column of summary statistics with the provided name. @@ -674,7 +674,7 @@ function _describe(df::AbstractDataFrame, stats::AbstractVector) splice!(stats, i, allowed_fields) # insert in the stats vector to get a good order elseif predefined_funs == [:detailed] predefined_funs = [:mean, :std, :min, :q25, :median, :q75, - :max, :sum, :nunique, :nmissing, :eltype] + :max, :nunique, :nmissing, :eltype] i = findfirst(s -> s == :detailed, stats) splice!(stats, i, predefined_funs) # insert in the stats vector to get a good order elseif :all in predefined_funs || :detailed in predefined_funs diff --git a/test/dataframe.jl b/test/dataframe.jl index 5677834d8..f46c8e6a1 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -571,7 +571,7 @@ end # Test that it works with :detailed @test describe_output[:, [:variable, :mean, :std, :min, :q25, :median, :q75, - :max, :sum, :nunique, :nmissing, :eltype]] ≅ + :max, :nunique, :nmissing, :eltype]] ≅ describe(df, :detailed) # Test that it works on a custom function From b7b526c7c45ffcab120440f2eccf31b11a450a0f Mon Sep 17 00:00:00 2001 From: Alec Loudenback Date: Thu, 27 Apr 2023 14:59:31 -0500 Subject: [PATCH 3/4] add NEWS.md entry for :sum --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index f41fb8f30..145db2e03 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,6 +13,8 @@ treated as if they were wrapped in `Cols` and does not throw an error when a vector of duplicate indices is passed when doing column selection ([#3302](https://github.com/JuliaData/DataFrames.jl/pull/3302)) +* `describe` now has `:sum` available as a descriptive statistic. + ([#3303](https://github.com/JuliaData/DataFrames.jl/pull/3303)) # DataFrames.jl v1.5 Release Notes From 829ae8b60f0ec6bd517a9e1489bbc68be4351943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 28 Apr 2023 16:11:28 +0200 Subject: [PATCH 4/4] Update dataframe.jl --- test/dataframe.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/dataframe.jl b/test/dataframe.jl index f46c8e6a1..a8c7f092e 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -595,8 +595,8 @@ end @test describe(df, cols=Not(1)) ≅ describe(select(df, Not(1))) @test describe(df, cols=Not("a")) ≅ describe(select(df, Not(1))) - @test describe(DataFrame(a=[1, 2]), cols=:a, :min, minimum => :min2, maximum => "max2", :max) == - DataFrame(variable=:a, min=1, min2=1, max2=2, max=2) + @test describe(DataFrame(a=[1, 2]), cols=:a, :min, minimum => :min2, maximum => "max2", :max, :sum) == + DataFrame(variable=:a, min=1, min2=1, max2=2, max=2, sum=3) @test_throws ArgumentError describe(df, :mean, :all) @test_throws MethodError describe(DataFrame(a=[1, 2]), cols=:a, "max2" => maximum)