From b7fe44cee82194f13196f2314978a57541bd99f2 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 9 Apr 2019 18:12:53 +0200 Subject: [PATCH] Accept any iterable in scalar stats (#472) --- src/moments.jl | 68 +++++++++-------- src/scalarstats.jl | 178 ++++++++++++++++++++++++++++---------------- test/scalarstats.jl | 34 ++++++++- 3 files changed, 180 insertions(+), 100 deletions(-) diff --git a/src/moments.jl b/src/moments.jl index 1d6af243af1e3b..2a3c3bffe637d9 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -2,7 +2,7 @@ ## var """ - varm(x, w::AbstractWeights, m, [dim]; corrected=false) + varm(x::AbstractArray, w::AbstractWeights, m, [dim]; corrected=false) Compute the variance of a real-valued array `x` with a known mean `m`, optionally over a dimension `dim`. Observations in `x` are weighted using weight vector `w`. @@ -22,7 +22,7 @@ varm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) = _moment2(v, w, m; corrected=depcheck(:varm, corrected)) """ - var(x, w::AbstractWeights, [dim]; mean=nothing, corrected=false) + var(x::AbstractArray, w::AbstractWeights, [dim]; mean=nothing, corrected=false) Compute the variance of a real-valued array `x`, optionally over a dimension `dim`. Observations in `x` are weighted using weight vector `w`. @@ -98,7 +98,7 @@ end ## std """ - stdm(v, w::AbstractWeights, m, [dim]; corrected=false) + stdm(x::AbstractArray, w::AbstractWeights, m, [dim]; corrected=false) Compute the standard deviation of a real-valued array `x` with a known mean `m`, optionally over a dimension `dim`. Observations in `x` are weighted using weight vector `w`. @@ -118,7 +118,7 @@ stdm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) = sqrt(varm(v, w, m, corrected=depcheck(:stdm, corrected))) """ - std(v, w::AbstractWeights, [dim]; mean=nothing, corrected=false) + std(x::AbstractArray, w::AbstractWeights, [dim]; mean=nothing, corrected=false) Compute the standard deviation of a real-valued array `x`, optionally over a dimension `dim`. Observations in `x` are weighted using weight vector `w`. @@ -153,66 +153,68 @@ std(v::RealArray, w::AbstractWeights, dim::Int; mean=nothing, """ mean_and_var(x, [w::AbstractWeights], [dim]; corrected=false) -> (mean, var) -Return the mean and variance of a real-valued array `x`, optionally over a dimension -`dim`, as a tuple. Observations in `x` can be weighted using weight vector `w`. +Return the mean and standard deviation of collection `x`. If `x` is an `AbstractArray`, +`dim` can be specified as a tuple to compute statistics over these dimensions. +A weighting vector `w` can be specified to weight the estimates. Finally, bias correction is be applied to the variance calculation if `corrected=true`. See [`var`](@ref) documentation for more details. """ -function mean_and_var(A::RealArray; corrected::Bool=true) - m = mean(A) - v = varm(A, m; corrected=corrected) +function mean_and_var(x; corrected::Bool=true) + m = mean(x) + v = varm(x, m; corrected=corrected) m, v end """ mean_and_std(x, [w::AbstractWeights], [dim]; corrected=false) -> (mean, std) -Return the mean and standard deviation of a real-valued array `x`, optionally -over a dimension `dim`, as a tuple. A weighting vector `w` can be specified -to weight the estimates. Finally, bias correction is applied to the +Return the mean and standard deviation of collection `x`. If `x` is an `AbstractArray`, +`dim` can be specified as a tuple to compute statistics over these dimensions. +A weighting vector `w` can be specified to weight the estimates. +Finally, bias correction is applied to the standard deviation calculation if `corrected=true`. See [`std`](@ref) documentation for more details. """ -function mean_and_std(A::RealArray; corrected::Bool=true) - m = mean(A) - s = stdm(A, m; corrected=corrected) +function mean_and_std(x; corrected::Bool=true) + m = mean(x) + s = stdm(x, m; corrected=corrected) m, s end -function mean_and_var(A::RealArray, w::AbstractWeights; corrected::DepBool=nothing) - m = mean(A, w) - v = varm(A, w, m; corrected=depcheck(:mean_and_var, corrected)) +function mean_and_var(x::RealArray, w::AbstractWeights; corrected::DepBool=nothing) + m = mean(x, w) + v = varm(x, w, m; corrected=depcheck(:mean_and_var, corrected)) m, v end -function mean_and_std(A::RealArray, w::AbstractWeights; corrected::DepBool=nothing) - m = mean(A, w) - s = stdm(A, w, m; corrected=depcheck(:mean_and_std, corrected)) +function mean_and_std(x::RealArray, w::AbstractWeights; corrected::DepBool=nothing) + m = mean(x, w) + s = stdm(x, w, m; corrected=depcheck(:mean_and_std, corrected)) m, s end -function mean_and_var(A::RealArray, dim::Int; corrected::Bool=true) - m = mean(A, dims = dim) - v = varm(A, m, dims = dim, corrected=corrected) +function mean_and_var(x::RealArray, dim::Int; corrected::Bool=true) + m = mean(x, dims = dim) + v = varm(x, m, dims = dim, corrected=corrected) m, v end -function mean_and_std(A::RealArray, dim::Int; corrected::Bool=true) - m = mean(A, dims = dim) - s = stdm(A, m, dim; corrected=corrected) +function mean_and_std(x::RealArray, dim::Int; corrected::Bool=true) + m = mean(x, dims = dim) + s = stdm(x, m, dim; corrected=corrected) m, s end -function mean_and_var(A::RealArray, w::AbstractWeights, dims::Int; +function mean_and_var(x::RealArray, w::AbstractWeights, dims::Int; corrected::DepBool=nothing) - m = mean(A, w, dims=dims) - v = varm(A, w, m, dims; corrected=depcheck(:mean_and_var, corrected)) + m = mean(x, w, dims=dims) + v = varm(x, w, m, dims; corrected=depcheck(:mean_and_var, corrected)) m, v end -function mean_and_std(A::RealArray, w::AbstractWeights, dims::Int; +function mean_and_std(x::RealArray, w::AbstractWeights, dims::Int; corrected::DepBool=nothing) - m = mean(A, w, dims=dims) - s = stdm(A, w, m, dims; corrected=depcheck(:mean_and_std, corrected)) + m = mean(x, w, dims=dims) + s = stdm(x, w, m, dims; corrected=depcheck(:mean_and_std, corrected)) m, s end diff --git a/src/scalarstats.jl b/src/scalarstats.jl index adf8ae4c296310..40a0edf34246b6 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -53,7 +53,7 @@ over a specified range `r`. If several modes exist, the first one (in order of appearance) is returned. """ function mode(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer - isempty(a) && error("mode: input array cannot be empty.") + isempty(a) && throw(ArgumentError("mode is not defined for empty collections")) len = length(a) r0 = r[1] r1 = r[end] @@ -105,17 +105,18 @@ function modes(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer return ms end -# compute mode over arbitrary array -function mode(a::AbstractArray{T}) where T - isempty(a) && error("mode: input array cannot be empty.") - cnts = Dict{T,Int}() +# compute mode over arbitrary iterable +function mode(a) + isempty(a) && throw(ArgumentError("mode is not defined for empty collections")) + cnts = Dict{eltype(a),Int}() # first element mc = 1 - mv = a[1] + mv, st = iterate(a) cnts[mv] = 1 # find the mode along with table construction - for i = 2 : length(a) - @inbounds x = a[i] + y = iterate(a, st) + while y !== nothing + x, st = y if haskey(cnts, x) c = (cnts[x] += 1) if c > mc @@ -126,19 +127,22 @@ function mode(a::AbstractArray{T}) where T cnts[x] = 1 # in this case: c = 1, and thus c > mc won't happen end + y = iterate(a, st) end return mv end -function modes(a::AbstractArray{T}) where T - isempty(a) && error("modes: input array cannot be empty.") - cnts = Dict{T,Int}() +function modes(a) + isempty(a) && throw(ArgumentError("mode is not defined for empty collections")) + cnts = Dict{eltype(a),Int}() # first element mc = 1 - cnts[a[1]] = 1 + x, st = iterate(a) + cnts[x] = 1 # find the mode along with table construction - for i = 2 : length(a) - @inbounds x = a[i] + y = iterate(a, st) + while y !== nothing + x, st = y if haskey(cnts, x) c = (cnts[x] += 1) if c > mc @@ -148,15 +152,10 @@ function modes(a::AbstractArray{T}) where T cnts[x] = 1 # in this case: c = 1, and thus c > mc won't happen end + y = iterate(a, st) end # find values corresponding to maximum counts - ms = T[] - for (x, c) in cnts - if c == mc - push!(ms, x) - end - end - return ms + return [x for (x, c) in cnts if c == mc] end @@ -167,22 +166,22 @@ end ############################# """ - percentile(v, p) + percentile(x, p) -Return the `p`th percentile of a real-valued array `v`, i.e. `quantile(x, p / 100)`. +Return the `p`th percentile of a collection `x`, i.e. `quantile(x, p / 100)`. """ -percentile(v::AbstractArray{<:Real}, p) = quantile(v, p * 0.01) +percentile(x, p) = quantile(x, p * 0.01) """ - nquantile(v, n) + nquantile(x, n::Integer) -Return the n-quantiles of a real-valued array, i.e. the values which +Return the n-quantiles of collection `x`, i.e. the values which partition `v` into `n` subsets of nearly equal size. -Equivalent to `quantile(v, [0:n]/n)`. For example, `nquantiles(x, 5)` +Equivalent to `quantile(x, [0:n]/n)`. For example, `nquantiles(x, 5)` returns a vector of quantiles, respectively at `[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]`. """ -nquantile(v::AbstractArray{<:Real}, n::Integer) = quantile(v, (0:n)/n) +nquantile(x, n::Integer) = quantile(x, (0:n)/n) ############################# @@ -195,75 +194,128 @@ nquantile(v::AbstractArray{<:Real}, n::Integer) = quantile(v, (0:n)/n) """ span(x) -Return the span of an integer array, i.e. the range `minimum(x):maximum(x)`. -The minimum and maximum of `x` are computed in one-pass using `extrema`. +Return the span of a collection, i.e. the range `minimum(x):maximum(x)`. +The minimum and maximum of `x` are computed in one pass using `extrema`. """ -span(x::AbstractArray{<:Integer}) = ((a, b) = extrema(x); a:b) +span(x) = ((a, b) = extrema(x); a:b) # Variation coefficient: std / mean """ variation(x, m=mean(x)) -Return the coefficient of variation of an array `x`, optionally specifying +Return the coefficient of variation of collection `x`, optionally specifying a precomputed mean `m`. The coefficient of variation is the ratio of the standard deviation to the mean. """ -variation(x::AbstractArray{<:Real}, m::Real) = stdm(x, m) / m -variation(x::AbstractArray{<:Real}) = variation(x, mean(x)) +variation(x, m) = stdm(x, m) / m +variation(x) = ((m, s) = mean_and_std(x); s/m) + +# Standard error of the mean: std / sqrt(len) +# Code taken from var in the Statistics stdlib module + +# faster computation of real(conj(x)*y) +realXcY(x::Real, y::Real) = x*y +realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y) -# Standard error of the mean: std(a) / sqrt(len) """ - sem(a) + sem(x) -Return the standard error of the mean of `a`, i.e. `sqrt(var(a) / length(a))`. +Return the standard error of the mean of collection `x`, +i.e. `sqrt(var(x, corrected=true) / length(x))`. """ -sem(a::AbstractArray{<:Real}) = sqrt(var(a) / length(a)) +function sem(x) + y = iterate(x) + if y === nothing + T = eltype(x) + # Return the NaN of the type that we would get, had this collection + # contained any elements (this is consistent with std) + return oftype(sqrt((abs2(zero(T)) + abs2(zero(T)))/2), NaN) + end + count = 1 + value, state = y + y = iterate(x, state) + # Use Welford algorithm as seen in (among other places) + # Knuth's TAOCP, Vol 2, page 232, 3rd edition. + M = value / 1 + S = real(zero(M)) + while y !== nothing + value, state = y + y = iterate(x, state) + count += 1 + new_M = M + (value - M) / count + S = S + realXcY(value - M, value - new_M) + M = new_M + end + var = S / (count - 1) + return sqrt(var/count) +end # Median absolute deviation +@irrational mad_constant 1.4826022185056018 BigFloat("1.482602218505601860547076529360423431326703202590312896536266275245674447622701") + """ - mad(v; center=median(v), normalize=true) + mad(x; center=median(x), normalize=true) -Compute the median absolute deviation (MAD) of `v` around `center` +Compute the median absolute deviation (MAD) of collection `x` around `center` (by default, around the median). If `normalize` is set to `true`, the MAD is multiplied by `1 / quantile(Normal(), 3/4) ≈ 1.4826`, in order to obtain a consistent estimator of the standard deviation under the assumption that the data is normally distributed. """ -function mad(v::AbstractArray{T}; - center::Union{Real,Nothing}=nothing, - normalize::Union{Bool, Nothing}=nothing) where T<:Real - isempty(v) && throw(ArgumentError("mad is not defined for empty arrays")) - - S = promote_type(T, typeof(middle(first(v)))) - v2 = LinearAlgebra.copy_oftype(v, S) - +function mad(x; center=nothing, normalize::Union{Bool, Nothing}=nothing, constant=nothing) if normalize === nothing Base.depwarn("the `normalize` keyword argument will be false by default in future releases: set it explicitly to silence this deprecation", :mad) normalize = true end - mad!(v2, center=center === nothing ? median!(v2) : center, normalize=normalize) + isempty(x) && throw(ArgumentError("mad is not defined for empty arrays")) + T = eltype(x) + # Knowing the eltype allows allocating a single array able to hold both original values + # and differences from the center, instead of two arrays + S = isconcretetype(T) ? promote_type(T, typeof(middle(zero(T)))) : T + x2 = x isa AbstractArray ? LinearAlgebra.copy_oftype(x, S) : collect(S, x) + c = center === nothing ? median!(x2) : center + if isconcretetype(T) + x2 .= abs.(x2 .- c) + else + x2 = abs.(x2 .- c) + end + m = median!(x2) + if normalize isa Nothing + Base.depwarn("the `normalize` keyword argument will be false by default in future releases: set it explicitly to silence this deprecation", :mad) + normalize = true + end + if !isa(constant, Nothing) + Base.depwarn("keyword argument `constant` is deprecated, use `normalize` instead or apply the multiplication directly", :mad) + m * constant + elseif normalize + m * mad_constant + else + m + end end -@irrational mad_constant 1.4826022185056018 BigFloat("1.482602218505601860547076529360423431326703202590312896536266275245674447622701") """ - StatsBase.mad!(v; center=median!(v), normalize=true) + StatsBase.mad!(x; center=median!(x), normalize=true) -Compute the median absolute deviation (MAD) of `v` around `center` -(by default, around the median), overwriting `v` in the process. +Compute the median absolute deviation (MAD) of array `x` around `center` +(by default, around the median), overwriting `x` in the process. +`x` must be able to hold values of generated by calling `middle` on its elements +(for example an integer vector is not appropriate since `middle` can produce +non-integer values). If `normalize` is set to `true`, the MAD is multiplied by `1 / quantile(Normal(), 3/4) ≈ 1.4826`, in order to obtain a consistent estimator of the standard deviation under the assumption that the data is normally distributed. """ -function mad!(v::AbstractArray{<:Real}; - center::Real=median!(v), +function mad!(x::AbstractArray; + center=median!(x), normalize::Union{Bool,Nothing}=true, constant=nothing) - isempty(v) && throw(ArgumentError("mad is not defined for empty arrays")) - v .= abs.(v .- center) - m = median!(v) + isempty(x) && throw(ArgumentError("mad is not defined for empty arrays")) + x .= abs.(x .- center) + m = median!(x) if normalize isa Nothing Base.depwarn("the `normalize` keyword argument will be false by default in future releases: set it explicitly to silence this deprecation", :mad) normalize = true @@ -280,19 +332,19 @@ end # Interquartile range """ - iqr(v) + iqr(x) -Compute the interquartile range (IQR) of an array, i.e. the 75th percentile +Compute the interquartile range (IQR) of collection `x`, i.e. the 75th percentile minus the 25th percentile. """ -iqr(v::AbstractArray{<:Real}) = (q = quantile(v, [.25, .75]); q[2] - q[1]) +iqr(x) = (q = quantile(x, [.25, .75]); q[2] - q[1]) # Generalized variance """ genvar(X) Compute the generalized sample variance of `X`. If `X` is a vector, one-column matrix, -or other one-dimensional iterable, this is equivalent to the sample variance. +or other iterable, this is equivalent to the sample variance. Otherwise if `X` is a matrix, this is equivalent to the determinant of the covariance matrix of `X`. @@ -308,7 +360,7 @@ genvar(itr) = var(itr) totalvar(X) Compute the total sample variance of `X`. If `X` is a vector, one-column matrix, -or other one-dimensional iterable, this is equivalent to the sample variance. +or other iterable, this is equivalent to the sample variance. Otherwise if `X` is a matrix, this is equivalent to the sum of the diagonal elements of the covariance matrix of `X`. """ diff --git a/test/scalarstats.jl b/test/scalarstats.jl index 74eb9658d51908..4418f0fc31006c 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -39,6 +39,15 @@ using DelimitedFiles @test modes([1, 2, 3, 3, 2, 2, 1]) == [2] @test sort(modes([1, 3, 2, 3, 3, 2, 2, 1])) == [2, 3] +@test mode(skipmissing([1, missing, missing, 3, 2, 2, missing])) == 2 +@test modes(skipmissing([1, missing, missing, 3, 2, 2, missing])) == [2] +@test sort(modes(skipmissing([1, missing, 3, 3, 2, 2, missing]))) == [2, 3] + +@test_throws ArgumentError mode(Int[]) +@test_throws ArgumentError modes(Int[]) +@test_throws ArgumentError mode(Any[]) +@test_throws ArgumentError modes(Any[]) + ## zscores @test zscore([-3:3;], 1.5, 0.5) == [-9.0:2.0:3.0;] @@ -67,26 +76,43 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.] @test nquantile(1:5, 2) ≈ [1, 3, 5] @test nquantile(1:5, 4) ≈ [1:5;] +@test nquantile(skipmissing([missing, 2, 5, missing]), 2) ≈ [2.0, 3.5, 5.0] @test percentile([1:5;], 25) ≈ 2.0 @test percentile([1:5;], [25, 50, 75]) ≈ [2.0, 3.0, 4.0] +@test percentile(skipmissing([missing, 2, 5, missing]), 25) ≈ 2.75 +@test percentile(skipmissing([missing, 2, 5, missing]), [25, 50, 75]) ≈ [2.75, 3.5, 4.25] ##### Dispersion @test span([3, 4, 5, 6, 2]) == (2:6) +@test span(skipmissing([1, missing, 5, missing])) == 1:5 @test variation([1:5;]) ≈ 0.527046276694730 +@test variation(skipmissing([missing; 1:5; missing])) ≈ 0.527046276694730 @test sem([1:5;]) ≈ 0.707106781186548 - -@test StatsBase.mad(1:5; center=3, normalize=true) ≈ 1.4826022185056018 +@test sem(skipmissing([missing; 1:5; missing])) ≈ 0.707106781186548 +@test sem(Int[]) === NaN +@test sem(skipmissing(Union{Int,Missing}[missing, missing])) === NaN +@test_throws MethodError sem(Any[]) +@test_throws MethodError sem(skipmissing([missing])) + +@test mad(1:5; center=3, normalize=true) ≈ 1.4826022185056018 +@test mad(skipmissing([missing; 1:5; missing]); center=3, normalize=true) ≈ 1.4826022185056018 @test StatsBase.mad!([1:5;]; center=3, normalize=true) ≈ 1.4826022185056018 @test mad(1:5, normalize=true) ≈ 1.4826022185056018 -@test StatsBase.mad(1:5, normalize=false) ≈ 1.0 +@test mad(1:5, normalize=false) ≈ 1.0 +@test mad(skipmissing([missing; 1:5; missing]), normalize=true) ≈ 1.4826022185056018 +@test mad(skipmissing([missing; 1:5; missing]), normalize=false) ≈ 1.0 @test StatsBase.mad!([1:5;], normalize=false) ≈ 1.0 -@test StatsBase.mad(1:5, center=3, normalize=false) ≈ 1.0 +@test mad(1:5, center=3, normalize=false) ≈ 1.0 +@test mad(skipmissing([missing; 1:5; missing]), center=3, normalize=false) ≈ 1.0 @test StatsBase.mad!([1:5;], center=3, normalize=false) ≈ 1.0 +@test mad((x for x in (1, 2.1)), normalize=false) ≈ 0.55 +@test mad(Any[1, 2.1], normalize=false) ≈ 0.55 +@test mad(Union{Int,Missing}[1, 2], normalize=false) ≈ 0.5 @test_throws ArgumentError mad(Int[]) # Issue 197