diff --git a/docs/source/scalarstats.rst b/docs/source/scalarstats.rst index 96db83c34..382296d75 100644 --- a/docs/source/scalarstats.rst +++ b/docs/source/scalarstats.rst @@ -169,7 +169,10 @@ Quantile and Friends w = rand(n) xk = median(x, weights(w)) +.. function:: quantile(x, w, p) + Compute the weighted quantiles of a vector ``x`` at a specified set of probability values ``p``, using weights given by a weight vector ``w`` (of type ``WeightVec``). Weights must not be negative. The weights and data vectors must have the same length. The quantile for :math:`p` is defined as follows. Denoting :math:`S_k = (k-1)w_k + (n-1) \sum_{i Sk + +function quantile{V, W <: Real}(v::RealVector{V}, w::WeightVec{W}, p::RealVector) + + # checks + isempty(v) && error("quantile of an empty array is undefined") + isempty(p) && throw(ArgumentError("empty quantile array")) + + w.sum == 0 && error("weight vector cannot sum to zero") + length(v) == length(w) || error("data and weight vectors must be the same size, got $(length(v)) and $(length(w))") + for x in w.values + isnan(x) && error("weight vector cannot contain NaN entries") + x < 0 && error("weight vector cannot contain negative entries") + end + + # full sort + vw = sort!(collect(zip(v, w.values))) + + wsum = w.sum + + # prepare percentiles + ppermute = sortperm(p) + p = p[ppermute] + p = bound_quantiles(p) + + # prepare out vector + N = length(vw) + out = Array(typeof(zero(V)/1), length(p)) + fill!(out, vw[end][1]) + + # start looping on quantiles + cumulative_weight, Sk, Skold = zero(W), zero(W), zero(W) + vk, vkold = zero(V), zero(V) + k = 1 + for i in 1:length(p) + h = p[i] * (N - 1) * wsum + if h == 0 + # happens when N or p or wsum equal zero + out[ppermute[i]] = vw[1][1] + else + while Sk <= h + # happens in particular when k == 1 + vk, wk = vw[k] + cumulative_weight += wk + if k >= N + # out was initialized with maximum v + return out + end + k += 1 + Skold, vkold = Sk, vk + vk, wk = vw[k] + Sk = (k - 1) * wk + (N - 1) * cumulative_weight + end + # in particular, Sk is different from Skold + g = (h - Skold) / (Sk - Skold) + out[ppermute[i]] = vkold + g * (vk - vkold) + end + end + return out +end + +# similarly to statistics.jl in Base +function bound_quantiles{T <: Real}(qs::AbstractVector{T}) + epsilon = 100 * eps() + if (any(qs .< -epsilon) || any(qs .> 1+epsilon)) + throw(ArgumentError("quantiles out of [0,1] range")) + end + T[min(one(T), max(zero(T), q)) for q = qs] +end + +quantile{W <: Real}(v::RealVector, w::WeightVec{W}, p::Number) = quantile(v, w, [p])[1] +wquantile{W <: Real}(v::RealVector, w::WeightVec{W}, p::RealVector) = quantile(v, w, p) +wquantile{W <: Real}(v::RealVector, w::WeightVec{W}, p::Number) = quantile(v, w, [p])[1] +wquantile(v::RealVector, w::RealVector, p::RealVector) = quantile(v, weights(w), p) +wquantile(v::RealVector, w::RealVector, p::Number) = quantile(v, weights(w), [p])[1] diff --git a/test/weights.jl b/test/weights.jl index 40da881f8..6ed0caf5e 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -233,3 +233,104 @@ wt = [-1, -1, -1, -1, -1] @test_throws ErrorException median(data, weights(wt)) wt = [-1, -1, -1, 0, 0] @test_throws ErrorException median(data, weights(wt)) + + +# Weighted quantile tests +data = ( + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10, 15], + [1, 2, 4, 7, 10, 15], + [0, 10, 20, 30], + [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5], + [30, 40, 50, 60, 35], + [2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7], + [1, 2, 2], + [3.7, 3.3, 3.5, 2.8], + [100, 125, 123, 60, 45, 56, 66], + [2, 2, 2, 2, 2, 2], + [2.3], + [-2, -3, 1, 2, -10], + [1, 2, 3, 4, 5], + [5, 4, 3, 2, 1], + [-2, 2, -1, 3, 6], + [-10, 1, 1, -10, -10], +) +wt = ( + weights([1, 1/3, 1/3, 1/3, 1]), + weights([1, 1, 1, 1, 1]), + weights([1, 1/3, 1/3, 1/3, 1, 1]), + weights([1/3, 1/3, 1/3, 1, 1, 1]), + weights([30, 191, 9, 0]), + weights([10, 1, 1, 1, 9]), + weights([10, 1, 1, 1, 900]), + weights([1, 3, 5, 4, 2]), + weights([2, 2, 5, 1, 2, 2, 1, 6]), + weights([0.1, 0.1, 0.8]), + weights([5, 5, 4, 1]), + weights([30, 56, 144, 24, 55, 43, 67]), + weights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]), + weights([12]), + weights([7, 1, 1, 1, 6]), + weights([1, 0, 0, 0, 2]), + weights([1, 2, 3, 4, 5]), + weights([0.1, 0.2, 0.3, 0.2, 0.1]), + weights([1, 1, 1, 1, 1]), +) +quantile_answers = ( + [1.0,3.6000000000000005,6.181818181818182,8.2,10.0], + [1.0,2.0,4.0,7.0,10.0], + [1.0,4.75,8.0,10.833333333333334,15.0], + [1.0,4.75,8.0,10.833333333333334,15.0], + [0.0,6.1387900355871885,11.600000000000001,15.912500000000001,30.0], + [1.0,1.5365853658536586,2.5999999999999996,4.405405405405405,5.0], + [1.0,4.239377950569287,4.492918633712858,4.746459316856429,5.0], + [30.0,38.75,45.714285714285715,52.85714285714286,60.0], + [0.3,0.6903846153846154,1.484,1.7,2.0], + [1.0,2.0,2.0,2.0,2.0], + [2.8,3.3361111111111112,3.4611111111111112,3.581578947368421,3.7], + [45.0,59.88593155893536,100.08846153846153,118.62115384615385,125.0], + [2.0,2.0,2.0,2.0,2.0], + [2.3,2.3,2.3,2.3,2.3], + [-10.0,-5.52,-2.5882352941176467,-0.9411764705882351,2.0], + [1.0,1.75,4.25,4.625,5.0], + [1.0,1.625,2.3333333333333335,3.25,5.0], + [-2.0,-0.5384615384615388,1.5384615384615383,2.6999999999999997,6.0], + [-10.0,-10.0,-10.0,1.0,1.0] +) +p = [0.0, 0.25, 0.5, 0.75, 1.0] + +srand(10) +for i = 1:length(data) + @test_approx_eq quantile(data[i], wt[i], p) quantile_answers[i] + for j = 1:10 + # order of p does not matter + reorder = sortperm(rand(length(p))) + @test_approx_eq quantile(data[i], wt[i], p[reorder]) quantile_answers[i][reorder] + end + for j = 1:10 + # order of w does not matter + reorder = sortperm(rand(length(data[i]))) + @test_approx_eq quantile(data[i][reorder], weights(wt[i][reorder]), p) quantile_answers[i] + end +end +# w = 1 corresponds to base quantile +for i = 1:length(data) + @test_approx_eq quantile(data[i], weights(ones(Int64, length(data[i]))), p) quantile(data[i], p) + for j = 1:10 + prandom = rand(4) + @test_approx_eq quantile(data[i], weights(ones(Int64, length(data[i]))), prandom) quantile(data[i], prandom) + end +end + +# other syntaxes +v = [7, 1, 2, 4, 10] +w = [1, 1/3, 1/3, 1/3, 1] +answer = 6.181818181818182 +@test_approx_eq quantile(data[1], weights(w), 0.5) answer +@test_approx_eq wquantile(data[1], weights(w), [0.5]) [answer] +@test_approx_eq wquantile(data[1], weights(w), 0.5) answer +@test_approx_eq wquantile(data[1], w, [0.5]) [answer] +@test_approx_eq wquantile(data[1], w, 0.5) answer +