From b7fe44cee82194f13196f2314978a57541bd99f2 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Tue, 9 Apr 2019 18:12:53 +0200
Subject: [PATCH] Accept any iterable in scalar stats (#472)

---
 src/moments.jl      |  68 +++++++++--------
 src/scalarstats.jl  | 178 ++++++++++++++++++++++++++++----------------
 test/scalarstats.jl |  34 ++++++++-
 3 files changed, 180 insertions(+), 100 deletions(-)

diff --git a/src/moments.jl b/src/moments.jl
index 1d6af243af1e3b..2a3c3bffe637d9 100644
--- a/src/moments.jl
+++ b/src/moments.jl
@@ -2,7 +2,7 @@
 
 ## var
 """
-    varm(x, w::AbstractWeights, m, [dim]; corrected=false)
+    varm(x::AbstractArray, w::AbstractWeights, m, [dim]; corrected=false)
 
 Compute the variance of a real-valued array `x` with a known mean `m`, optionally
 over a dimension `dim`. Observations in `x` are weighted using weight vector `w`.
@@ -22,7 +22,7 @@ varm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) =
     _moment2(v, w, m; corrected=depcheck(:varm, corrected))
 
 """
-    var(x, w::AbstractWeights, [dim]; mean=nothing, corrected=false)
+    var(x::AbstractArray, w::AbstractWeights, [dim]; mean=nothing, corrected=false)
 
 Compute the variance of a real-valued array `x`, optionally over a dimension `dim`.
 Observations in `x` are weighted using weight vector `w`.
@@ -98,7 +98,7 @@ end
 
 ## std
 """
-    stdm(v, w::AbstractWeights, m, [dim]; corrected=false)
+    stdm(x::AbstractArray, w::AbstractWeights, m, [dim]; corrected=false)
 
 Compute the standard deviation of a real-valued array `x` with a known mean `m`,
 optionally over a dimension `dim`. Observations in `x` are weighted using weight vector `w`.
@@ -118,7 +118,7 @@ stdm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) =
     sqrt(varm(v, w, m, corrected=depcheck(:stdm, corrected)))
 
 """
-    std(v, w::AbstractWeights, [dim]; mean=nothing, corrected=false)
+    std(x::AbstractArray, w::AbstractWeights, [dim]; mean=nothing, corrected=false)
 
 Compute the standard deviation of a real-valued array `x`,
 optionally over a dimension `dim`. Observations in `x` are weighted using weight vector `w`.
@@ -153,66 +153,68 @@ std(v::RealArray, w::AbstractWeights, dim::Int; mean=nothing,
 """
     mean_and_var(x, [w::AbstractWeights], [dim]; corrected=false) -> (mean, var)
 
-Return the mean and variance of a real-valued array `x`, optionally over a dimension
-`dim`, as a tuple. Observations in `x` can be weighted using weight vector `w`.
+Return the mean and standard deviation of collection `x`. If `x` is an `AbstractArray`,
+`dim` can be specified as a tuple to compute statistics over these dimensions.
+A weighting vector `w` can be specified to weight the estimates.
 Finally, bias correction is be applied to the variance calculation if `corrected=true`.
 See [`var`](@ref) documentation for more details.
 """
-function mean_and_var(A::RealArray; corrected::Bool=true)
-    m = mean(A)
-    v = varm(A, m; corrected=corrected)
+function mean_and_var(x; corrected::Bool=true)
+    m = mean(x)
+    v = varm(x, m; corrected=corrected)
     m, v
 end
 
 """
     mean_and_std(x, [w::AbstractWeights], [dim]; corrected=false) -> (mean, std)
 
-Return the mean and standard deviation of a real-valued array `x`, optionally
-over a dimension `dim`, as a tuple. A weighting vector `w` can be specified
-to weight the estimates. Finally, bias correction is applied to the
+Return the mean and standard deviation of collection `x`. If `x` is an `AbstractArray`,
+`dim` can be specified as a tuple to compute statistics over these dimensions.
+A weighting vector `w` can be specified to weight the estimates.
+Finally, bias correction is applied to the
 standard deviation calculation if `corrected=true`.
 See [`std`](@ref) documentation for more details.
 """
-function mean_and_std(A::RealArray; corrected::Bool=true)
-    m = mean(A)
-    s = stdm(A, m; corrected=corrected)
+function mean_and_std(x; corrected::Bool=true)
+    m = mean(x)
+    s = stdm(x, m; corrected=corrected)
     m, s
 end
 
-function mean_and_var(A::RealArray, w::AbstractWeights; corrected::DepBool=nothing)
-    m = mean(A, w)
-    v = varm(A, w, m; corrected=depcheck(:mean_and_var, corrected))
+function mean_and_var(x::RealArray, w::AbstractWeights; corrected::DepBool=nothing)
+    m = mean(x, w)
+    v = varm(x, w, m; corrected=depcheck(:mean_and_var, corrected))
     m, v
 end
-function mean_and_std(A::RealArray, w::AbstractWeights; corrected::DepBool=nothing)
-    m = mean(A, w)
-    s = stdm(A, w, m; corrected=depcheck(:mean_and_std, corrected))
+function mean_and_std(x::RealArray, w::AbstractWeights; corrected::DepBool=nothing)
+    m = mean(x, w)
+    s = stdm(x, w, m; corrected=depcheck(:mean_and_std, corrected))
     m, s
 end
 
 
-function mean_and_var(A::RealArray, dim::Int; corrected::Bool=true)
-    m = mean(A, dims = dim)
-    v = varm(A, m, dims = dim, corrected=corrected)
+function mean_and_var(x::RealArray, dim::Int; corrected::Bool=true)
+    m = mean(x, dims = dim)
+    v = varm(x, m, dims = dim, corrected=corrected)
     m, v
 end
-function mean_and_std(A::RealArray, dim::Int; corrected::Bool=true)
-    m = mean(A, dims = dim)
-    s = stdm(A, m, dim; corrected=corrected)
+function mean_and_std(x::RealArray, dim::Int; corrected::Bool=true)
+    m = mean(x, dims = dim)
+    s = stdm(x, m, dim; corrected=corrected)
     m, s
 end
 
 
-function mean_and_var(A::RealArray, w::AbstractWeights, dims::Int;
+function mean_and_var(x::RealArray, w::AbstractWeights, dims::Int;
                       corrected::DepBool=nothing)
-    m = mean(A, w, dims=dims)
-    v = varm(A, w, m, dims; corrected=depcheck(:mean_and_var, corrected))
+    m = mean(x, w, dims=dims)
+    v = varm(x, w, m, dims; corrected=depcheck(:mean_and_var, corrected))
     m, v
 end
-function mean_and_std(A::RealArray, w::AbstractWeights, dims::Int;
+function mean_and_std(x::RealArray, w::AbstractWeights, dims::Int;
                       corrected::DepBool=nothing)
-    m = mean(A, w, dims=dims)
-    s = stdm(A, w, m, dims; corrected=depcheck(:mean_and_std, corrected))
+    m = mean(x, w, dims=dims)
+    s = stdm(x, w, m, dims; corrected=depcheck(:mean_and_std, corrected))
     m, s
 end
 
diff --git a/src/scalarstats.jl b/src/scalarstats.jl
index adf8ae4c296310..40a0edf34246b6 100644
--- a/src/scalarstats.jl
+++ b/src/scalarstats.jl
@@ -53,7 +53,7 @@ over a specified range `r`. If several modes exist, the first
 one (in order of appearance) is returned.
 """
 function mode(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer
-    isempty(a) && error("mode: input array cannot be empty.")
+    isempty(a) && throw(ArgumentError("mode is not defined for empty collections"))
     len = length(a)
     r0 = r[1]
     r1 = r[end]
@@ -105,17 +105,18 @@ function modes(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer
     return ms
 end
 
-# compute mode over arbitrary array
-function mode(a::AbstractArray{T}) where T
-    isempty(a) && error("mode: input array cannot be empty.")
-    cnts = Dict{T,Int}()
+# compute mode over arbitrary iterable
+function mode(a)
+    isempty(a) && throw(ArgumentError("mode is not defined for empty collections"))
+    cnts = Dict{eltype(a),Int}()
     # first element
     mc = 1
-    mv = a[1]
+    mv, st = iterate(a)
     cnts[mv] = 1
     # find the mode along with table construction
-    for i = 2 : length(a)
-        @inbounds x = a[i]
+    y = iterate(a, st)
+    while y !== nothing
+        x, st = y
         if haskey(cnts, x)
             c = (cnts[x] += 1)
             if c > mc
@@ -126,19 +127,22 @@ function mode(a::AbstractArray{T}) where T
             cnts[x] = 1
             # in this case: c = 1, and thus c > mc won't happen
         end
+        y = iterate(a, st)
     end
     return mv
 end
 
-function modes(a::AbstractArray{T}) where T
-    isempty(a) && error("modes: input array cannot be empty.")
-    cnts = Dict{T,Int}()
+function modes(a)
+    isempty(a) && throw(ArgumentError("mode is not defined for empty collections"))
+    cnts = Dict{eltype(a),Int}()
     # first element
     mc = 1
-    cnts[a[1]] = 1
+    x, st = iterate(a)
+    cnts[x] = 1
     # find the mode along with table construction
-    for i = 2 : length(a)
-        @inbounds x = a[i]
+    y = iterate(a, st)
+    while y !== nothing
+        x, st = y
         if haskey(cnts, x)
             c = (cnts[x] += 1)
             if c > mc
@@ -148,15 +152,10 @@ function modes(a::AbstractArray{T}) where T
             cnts[x] = 1
             # in this case: c = 1, and thus c > mc won't happen
         end
+        y = iterate(a, st)
     end
     # find values corresponding to maximum counts
-    ms = T[]
-    for (x, c) in cnts
-        if c == mc
-            push!(ms, x)
-        end
-    end
-    return ms
+    return [x for (x, c) in cnts if c == mc]
 end
 
 
@@ -167,22 +166,22 @@ end
 #############################
 
 """
-    percentile(v, p)
+    percentile(x, p)
 
-Return the `p`th percentile of a real-valued array `v`, i.e. `quantile(x, p / 100)`.
+Return the `p`th percentile of a collection `x`, i.e. `quantile(x, p / 100)`.
 """
-percentile(v::AbstractArray{<:Real}, p) = quantile(v, p * 0.01)
+percentile(x, p) = quantile(x, p * 0.01)
 
 """
-    nquantile(v, n)
+    nquantile(x, n::Integer)
 
-Return the n-quantiles of a real-valued array, i.e. the values which
+Return the n-quantiles of collection `x`, i.e. the values which
 partition `v` into `n` subsets of nearly equal size.
 
-Equivalent to `quantile(v, [0:n]/n)`. For example, `nquantiles(x, 5)`
+Equivalent to `quantile(x, [0:n]/n)`. For example, `nquantiles(x, 5)`
 returns a vector of quantiles, respectively at `[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]`.
 """
-nquantile(v::AbstractArray{<:Real}, n::Integer) = quantile(v, (0:n)/n)
+nquantile(x, n::Integer) = quantile(x, (0:n)/n)
 
 
 #############################
@@ -195,75 +194,128 @@ nquantile(v::AbstractArray{<:Real}, n::Integer) = quantile(v, (0:n)/n)
 """
     span(x)
 
-Return the span of an integer array, i.e. the range `minimum(x):maximum(x)`.
-The minimum and maximum of `x` are computed in one-pass using `extrema`.
+Return the span of a collection, i.e. the range `minimum(x):maximum(x)`.
+The minimum and maximum of `x` are computed in one pass using `extrema`.
 """
-span(x::AbstractArray{<:Integer}) = ((a, b) = extrema(x); a:b)
+span(x) = ((a, b) = extrema(x); a:b)
 
 # Variation coefficient: std / mean
 """
     variation(x, m=mean(x))
 
-Return the coefficient of variation of an array `x`, optionally specifying
+Return the coefficient of variation of collection `x`, optionally specifying
 a precomputed mean `m`. The coefficient of variation is the ratio of the
 standard deviation to the mean.
 """
-variation(x::AbstractArray{<:Real}, m::Real) = stdm(x, m) / m
-variation(x::AbstractArray{<:Real}) = variation(x, mean(x))
+variation(x, m) = stdm(x, m) / m
+variation(x) = ((m, s) = mean_and_std(x); s/m)
+
+# Standard error of the mean: std / sqrt(len)
+# Code taken from var in the Statistics stdlib module
+
+# faster computation of real(conj(x)*y)
+realXcY(x::Real, y::Real) = x*y
+realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y)
 
-# Standard error of the mean: std(a) / sqrt(len)
 """
-    sem(a)
+    sem(x)
 
-Return the standard error of the mean of `a`, i.e. `sqrt(var(a) / length(a))`.
+Return the standard error of the mean of collection `x`,
+i.e. `sqrt(var(x, corrected=true) / length(x))`.
 """
-sem(a::AbstractArray{<:Real}) = sqrt(var(a) / length(a))
+function sem(x)
+    y = iterate(x)
+    if y === nothing
+        T = eltype(x)
+        # Return the NaN of the type that we would get, had this collection
+        # contained any elements (this is consistent with std)
+        return oftype(sqrt((abs2(zero(T)) + abs2(zero(T)))/2), NaN)
+    end
+    count = 1
+    value, state = y
+    y = iterate(x, state)
+    # Use Welford algorithm as seen in (among other places)
+    # Knuth's TAOCP, Vol 2, page 232, 3rd edition.
+    M = value / 1
+    S = real(zero(M))
+    while y !== nothing
+        value, state = y
+        y = iterate(x, state)
+        count += 1
+        new_M = M + (value - M) / count
+        S = S + realXcY(value - M, value - new_M)
+        M = new_M
+    end
+    var = S / (count - 1)
+    return sqrt(var/count)
+end
 
 # Median absolute deviation
+@irrational mad_constant 1.4826022185056018 BigFloat("1.482602218505601860547076529360423431326703202590312896536266275245674447622701")
+
 """
-    mad(v; center=median(v), normalize=true)
+    mad(x; center=median(x), normalize=true)
 
-Compute the median absolute deviation (MAD) of `v` around `center`
+Compute the median absolute deviation (MAD) of collection `x` around `center`
 (by default, around the median).
 
 If `normalize` is set to `true`, the MAD is multiplied by
 `1 / quantile(Normal(), 3/4) ≈ 1.4826`, in order to obtain a consistent estimator
 of the standard deviation under the assumption that the data is normally distributed.
 """
-function mad(v::AbstractArray{T};
-             center::Union{Real,Nothing}=nothing,
-             normalize::Union{Bool, Nothing}=nothing) where T<:Real
-    isempty(v) && throw(ArgumentError("mad is not defined for empty arrays"))
-
-    S = promote_type(T, typeof(middle(first(v))))
-    v2 = LinearAlgebra.copy_oftype(v, S)
-
+function mad(x; center=nothing, normalize::Union{Bool, Nothing}=nothing, constant=nothing)
     if normalize === nothing
         Base.depwarn("the `normalize` keyword argument will be false by default in future releases: set it explicitly to silence this deprecation", :mad)
         normalize = true
     end
 
-    mad!(v2, center=center === nothing ? median!(v2) : center, normalize=normalize)
+    isempty(x) && throw(ArgumentError("mad is not defined for empty arrays"))
+    T = eltype(x)
+    # Knowing the eltype allows allocating a single array able to hold both original values
+    # and differences from the center, instead of two arrays
+    S = isconcretetype(T) ? promote_type(T, typeof(middle(zero(T)))) : T
+    x2 = x isa AbstractArray ? LinearAlgebra.copy_oftype(x, S) : collect(S, x)
+    c = center === nothing ? median!(x2) : center
+    if isconcretetype(T)
+        x2 .= abs.(x2 .- c)
+    else
+        x2 = abs.(x2 .- c)
+    end
+    m = median!(x2)
+    if normalize isa Nothing
+        Base.depwarn("the `normalize` keyword argument will be false by default in future releases: set it explicitly to silence this deprecation", :mad)
+        normalize = true
+    end
+    if !isa(constant, Nothing)
+        Base.depwarn("keyword argument `constant` is deprecated, use `normalize` instead or apply the multiplication directly", :mad)
+        m * constant
+    elseif normalize
+        m * mad_constant
+    else
+        m
+    end
 end
 
-@irrational mad_constant 1.4826022185056018 BigFloat("1.482602218505601860547076529360423431326703202590312896536266275245674447622701")
 """
-    StatsBase.mad!(v; center=median!(v), normalize=true)
+    StatsBase.mad!(x; center=median!(x), normalize=true)
 
-Compute the median absolute deviation (MAD) of `v` around `center`
-(by default, around the median), overwriting `v` in the process.
+Compute the median absolute deviation (MAD) of array `x` around `center`
+(by default, around the median), overwriting `x` in the process.
+`x` must be able to hold values of generated by calling `middle` on its elements
+(for example an integer vector is not appropriate since `middle` can produce
+non-integer values).
 
 If `normalize` is set to `true`, the MAD is multiplied by
 `1 / quantile(Normal(), 3/4) ≈ 1.4826`, in order to obtain a consistent estimator
 of the standard deviation under the assumption that the data is normally distributed.
 """
-function mad!(v::AbstractArray{<:Real};
-              center::Real=median!(v),
+function mad!(x::AbstractArray;
+              center=median!(x),
               normalize::Union{Bool,Nothing}=true,
               constant=nothing)
-    isempty(v) && throw(ArgumentError("mad is not defined for empty arrays"))
-    v .= abs.(v .- center)
-    m = median!(v)
+    isempty(x) && throw(ArgumentError("mad is not defined for empty arrays"))
+    x .= abs.(x .- center)
+    m = median!(x)
     if normalize isa Nothing
         Base.depwarn("the `normalize` keyword argument will be false by default in future releases: set it explicitly to silence this deprecation", :mad)
         normalize = true
@@ -280,19 +332,19 @@ end
 
 # Interquartile range
 """
-    iqr(v)
+    iqr(x)
 
-Compute the interquartile range (IQR) of an array, i.e. the 75th percentile
+Compute the interquartile range (IQR) of collection `x`, i.e. the 75th percentile
 minus the 25th percentile.
 """
-iqr(v::AbstractArray{<:Real}) = (q = quantile(v, [.25, .75]); q[2] - q[1])
+iqr(x) = (q = quantile(x, [.25, .75]); q[2] - q[1])
 
 # Generalized variance
 """
     genvar(X)
 
 Compute the generalized sample variance of `X`. If `X` is a vector, one-column matrix,
-or other one-dimensional iterable, this is equivalent to the sample variance.
+or other iterable, this is equivalent to the sample variance.
 Otherwise if `X` is a matrix, this is equivalent to the determinant of the covariance
 matrix of `X`.
 
@@ -308,7 +360,7 @@ genvar(itr) = var(itr)
     totalvar(X)
 
 Compute the total sample variance of `X`. If `X` is a vector, one-column matrix,
-or other one-dimensional iterable, this is equivalent to the sample variance.
+or other iterable, this is equivalent to the sample variance.
 Otherwise if `X` is a matrix, this is equivalent to the sum of the diagonal elements
 of the covariance matrix of `X`.
 """
diff --git a/test/scalarstats.jl b/test/scalarstats.jl
index 74eb9658d51908..4418f0fc31006c 100644
--- a/test/scalarstats.jl
+++ b/test/scalarstats.jl
@@ -39,6 +39,15 @@ using DelimitedFiles
 @test modes([1, 2, 3, 3, 2, 2, 1]) == [2]
 @test sort(modes([1, 3, 2, 3, 3, 2, 2, 1])) == [2, 3]
 
+@test mode(skipmissing([1, missing, missing, 3, 2, 2, missing])) == 2
+@test modes(skipmissing([1, missing, missing, 3, 2, 2, missing])) == [2]
+@test sort(modes(skipmissing([1, missing, 3, 3, 2, 2, missing]))) == [2, 3]
+
+@test_throws ArgumentError mode(Int[])
+@test_throws ArgumentError modes(Int[])
+@test_throws ArgumentError mode(Any[])
+@test_throws ArgumentError modes(Any[])
+
 ## zscores
 
 @test zscore([-3:3;], 1.5, 0.5) == [-9.0:2.0:3.0;]
@@ -67,26 +76,43 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.]
 
 @test nquantile(1:5, 2) ≈ [1, 3, 5]
 @test nquantile(1:5, 4) ≈ [1:5;]
+@test nquantile(skipmissing([missing, 2, 5, missing]), 2) ≈ [2.0, 3.5, 5.0]
 
 @test percentile([1:5;], 25)           ≈  2.0
 @test percentile([1:5;], [25, 50, 75]) ≈ [2.0, 3.0, 4.0]
+@test percentile(skipmissing([missing, 2, 5, missing]), 25) ≈ 2.75
+@test percentile(skipmissing([missing, 2, 5, missing]), [25, 50, 75]) ≈ [2.75, 3.5, 4.25]
 
 
 ##### Dispersion
 
 @test span([3, 4, 5, 6, 2]) == (2:6)
+@test span(skipmissing([1, missing, 5, missing])) == 1:5
 
 @test variation([1:5;]) ≈ 0.527046276694730
+@test variation(skipmissing([missing; 1:5; missing])) ≈ 0.527046276694730
 
 @test sem([1:5;]) ≈ 0.707106781186548
-
-@test StatsBase.mad(1:5; center=3, normalize=true) ≈ 1.4826022185056018
+@test sem(skipmissing([missing; 1:5; missing])) ≈ 0.707106781186548
+@test sem(Int[]) === NaN
+@test sem(skipmissing(Union{Int,Missing}[missing, missing])) === NaN
+@test_throws MethodError sem(Any[])
+@test_throws MethodError sem(skipmissing([missing]))
+
+@test mad(1:5; center=3, normalize=true) ≈ 1.4826022185056018
+@test mad(skipmissing([missing; 1:5; missing]); center=3, normalize=true) ≈ 1.4826022185056018
 @test StatsBase.mad!([1:5;]; center=3, normalize=true) ≈ 1.4826022185056018
 @test mad(1:5, normalize=true) ≈ 1.4826022185056018
-@test StatsBase.mad(1:5, normalize=false) ≈ 1.0
+@test mad(1:5, normalize=false) ≈ 1.0
+@test mad(skipmissing([missing; 1:5; missing]), normalize=true) ≈ 1.4826022185056018
+@test mad(skipmissing([missing; 1:5; missing]), normalize=false) ≈ 1.0
 @test StatsBase.mad!([1:5;], normalize=false) ≈ 1.0
-@test StatsBase.mad(1:5, center=3, normalize=false) ≈ 1.0
+@test mad(1:5, center=3, normalize=false) ≈ 1.0
+@test mad(skipmissing([missing; 1:5; missing]), center=3, normalize=false) ≈ 1.0
 @test StatsBase.mad!([1:5;], center=3, normalize=false) ≈ 1.0
+@test mad((x for x in (1, 2.1)), normalize=false) ≈ 0.55
+@test mad(Any[1, 2.1], normalize=false) ≈ 0.55
+@test mad(Union{Int,Missing}[1, 2], normalize=false) ≈ 0.5
 @test_throws ArgumentError mad(Int[])
 
 # Issue 197