diff --git a/NEWS.md b/NEWS.md index f41fb8f30..145db2e03 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,6 +13,8 @@ treated as if they were wrapped in `Cols` and does not throw an error when a vector of duplicate indices is passed when doing column selection ([#3302](https://github.com/JuliaData/DataFrames.jl/pull/3302)) +* `describe` now has `:sum` available as a descriptive statistic. + ([#3303](https://github.com/JuliaData/DataFrames.jl/pull/3303)) # DataFrames.jl v1.5 Release Notes diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 7b7779e81..6b544e602 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -577,11 +577,11 @@ where each row represents a variable and each column a summary statistic. - `stats::Union{Symbol, Pair}...` : the summary statistics to report. Arguments can be: - A symbol from the list `:mean`, `:std`, `:min`, `:q25`, - `:median`, `:q75`, `:max`, `:eltype`, `:nunique`, `:nuniqueall`, `:first`, + `:median`, `:q75`, `:max`, `:sum`, `:eltype`, `:nunique`, `:nuniqueall`, `:first`, `:last`, `:nnonmissing`, and `:nmissing`. The default statistics used are `:mean`, `:min`, `:median`, `:max`, `:nmissing`, and `:eltype`. - `:detailed` as the only `Symbol` argument to return all statistics - except `:first`, `:last`, `:nuniqueall`, and `:nnonmissing`. + except `:first`, `:last`, `:sum`, `:nuniqueall`, and `:nnonmissing`. - `:all` as the only `Symbol` argument to return all statistics. - A `function => name` pair where `name` is a `Symbol` or string. This will create a column of summary statistics with the provided name. @@ -664,7 +664,7 @@ DataAPI.describe(df::AbstractDataFrame; cols=:) = function _describe(df::AbstractDataFrame, stats::AbstractVector) predefined_funs = Symbol[s for s in stats if s isa Symbol] - allowed_fields = [:mean, :std, :min, :q25, :median, :q75, :max, + allowed_fields = [:mean, :std, :min, :q25, :median, :q75, :max, :sum, :nunique, :nuniqueall, :nmissing, :nnonmissing, :first, :last, :eltype] @@ -799,6 +799,10 @@ function get_stats(@nospecialize(col::Union{AbstractVector, Base.SkipMissing}), d[:nuniqueall] = try length(Set(col)) catch end end + if :sum in stats + d[:sum] = try sum(col) catch end + end + return d end diff --git a/test/dataframe.jl b/test/dataframe.jl index d9cce7682..a8c7f092e 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -548,6 +548,7 @@ end median=[2.5, 2.0, nothing, nothing, VERSION >= v"1.7.0-beta1.2" ? Date(2002) : nothing, nothing], q75=[3.25, 2.5, nothing, nothing, nothing, nothing], max=[4.0, 3.0, "d", "c", Date(2004), 2], + sum=[10, 6, nothing, nothing, nothing, nothing], nunique=[nothing, nothing, 4, 3, 4, 2], nuniqueall=[4, 3, 4, 3, 4, 2], nmissing=[0, 1, 0, 1, 0, 0], @@ -594,8 +595,8 @@ end @test describe(df, cols=Not(1)) ≅ describe(select(df, Not(1))) @test describe(df, cols=Not("a")) ≅ describe(select(df, Not(1))) - @test describe(DataFrame(a=[1, 2]), cols=:a, :min, minimum => :min2, maximum => "max2", :max) == - DataFrame(variable=:a, min=1, min2=1, max2=2, max=2) + @test describe(DataFrame(a=[1, 2]), cols=:a, :min, minimum => :min2, maximum => "max2", :max, :sum) == + DataFrame(variable=:a, min=1, min2=1, max2=2, max=2, sum=3) @test_throws ArgumentError describe(df, :mean, :all) @test_throws MethodError describe(DataFrame(a=[1, 2]), cols=:a, "max2" => maximum)