JuliaStats · johnmyleswhite · Oct 6, 2015 · Oct 2, 2015
diff --git a/docs/source/scalarstats.rst b/docs/source/scalarstats.rst
@@ -169,7 +169,10 @@ Quantile and Friends
     w = rand(n)
     xk = median(x, weights(w))
 
+.. function:: quantile(x, w, p)
 
+  Compute the weighted quantiles of a vector ``x`` at a specified set of probability values ``p``, using weights given by a weight vector ``w`` (of type ``WeightVec``).  Weights must not be negative. The weights and data vectors must have the same length. The quantile for :math:`p` is defined as follows.  Denoting :math:`S_k = (k-1)w_k + (n-1) \sum_{i<k}w_i`, define :math:`x_{k+1}` the smallest element of ``x`` such that :math:`S_{k+1}/S_{n}` is strictly superior to :math:`p`. The function returns :math:`(1-\gamma) x_k + \gamma x_{k+1}` with  :math:`\gamma = (pS_n- S_k)/(S_{k+1}-S_k)`. This corresponds to  R-7, Excel, SciPy-(1,1), Maple-6 when ``w`` is one (see https://en.wikipedia.org/wiki/Quantile).
+
 Mode and Modes
 ---------------
 

diff --git a/src/StatsBase.jl b/src/StatsBase.jl
@@ -52,6 +52,7 @@ module StatsBase
     wmean,       # weighted mean
     wmean!,      # weighted mean across dimensions with provided storage
     wmedian,     # weighted median
+    wquantile,   # weighted quantile
 
     ## moments
     skewness,       # (standardized) skewness

diff --git a/src/weights.jl b/src/weights.jl
@@ -293,3 +293,84 @@ end
 
 wmedian(v::RealVector, w::RealVector) = median(v, weights(w))
 wmedian{W<:Real}(v::RealVector, w::WeightVec{W}) = median(v, w)
+
+###### Weighted quantile #####
+
+# http://stats.stackexchange.com/questions/13169/defining-quantiles-over-a-weighted-sample
+# In the non weighted version, we compute a vector of index h(N, p)
+# and take interpolation between floor and ceil of this index
+# Here there is a supplementary function from index to weighted index k -> Sk
+
+function quantile{V, W <: Real}(v::RealVector{V}, w::WeightVec{W}, p::RealVector)
+
+    # checks
+    isempty(v) && error("quantile of an empty array is undefined")
+    isempty(p) && throw(ArgumentError("empty quantile array"))
+
+    w.sum == 0 && error("weight vector cannot sum to zero")
+    length(v) == length(w) || error("data and weight vectors must be the same size, got $(length(v)) and $(length(w))")
+    for x in w.values
+        isnan(x) && error("weight vector cannot contain NaN entries")
+        x < 0 && error("weight vector cannot contain negative entries")
+    end
+
+    # full sort
+    vw = sort!(collect(zip(v, w.values)))
+
+    wsum = w.sum
+
+    # prepare percentiles
+    ppermute = sortperm(p)
+    p = p[ppermute]
+    p = bound_quantiles(p)
+
+    # prepare out vector
+    N = length(vw)
+    out = Array(typeof(zero(V)/1), length(p))
+    fill!(out, vw[end][1])
+
+    # start looping on quantiles
+    cumulative_weight, Sk, Skold =  zero(W), zero(W), zero(W)
+    vk, vkold = zero(V), zero(V)
+    k = 1
+    for i in 1:length(p)
+        h = p[i] * (N - 1) * wsum
+        if h == 0
+            # happens when N or p or wsum equal zero
+            out[ppermute[i]] = vw[1][1]     
+        else
+            while Sk <= h
+                # happens in particular when k == 1
+                vk, wk = vw[k]
+                cumulative_weight += wk
+                if k >= N 
+                    # out was initialized with maximum v
+                    return out
+                end
+                k += 1
+                Skold, vkold = Sk, vk
+                vk, wk = vw[k]
+                Sk = (k - 1) * wk + (N - 1) * cumulative_weight
+            end
+            # in particular, Sk is different from Skold
+            g = (h - Skold) / (Sk - Skold)
+            out[ppermute[i]] = vkold + g * (vk - vkold)
+        end
+    end
+    return out
+end
+
+# similarly to statistics.jl in Base
+function bound_quantiles{T <: Real}(qs::AbstractVector{T})
+    epsilon = 100 * eps()
+    if (any(qs .< -epsilon) || any(qs .> 1+epsilon))
+        throw(ArgumentError("quantiles out of [0,1] range"))
+    end
+    T[min(one(T), max(zero(T), q)) for q = qs]
+end
+
+quantile{W <: Real}(v::RealVector, w::WeightVec{W}, p::Number) = quantile(v, w, [p])[1]
+wquantile{W <: Real}(v::RealVector, w::WeightVec{W}, p::RealVector) = quantile(v, w, p)
+wquantile{W <: Real}(v::RealVector, w::WeightVec{W}, p::Number) = quantile(v, w, [p])[1]
+wquantile(v::RealVector, w::RealVector, p::RealVector) = quantile(v, weights(w), p)
+wquantile(v::RealVector, w::RealVector, p::Number) = quantile(v, weights(w), [p])[1]
diff --git a/test/weights.jl b/test/weights.jl
@@ -233,3 +233,104 @@ wt = [-1, -1, -1, -1, -1]
 @test_throws ErrorException median(data, weights(wt))
 wt = [-1, -1, -1, 0, 0]
 @test_throws ErrorException median(data, weights(wt))
+
+
+# Weighted quantile tests
+data = (
+    [7, 1, 2, 4, 10],
+    [7, 1, 2, 4, 10],
+    [7, 1, 2, 4, 10, 15],
+    [1, 2, 4, 7, 10, 15],
+    [0, 10, 20, 30],
+    [1, 2, 3, 4, 5],
+    [1, 2, 3, 4, 5],
+    [30, 40, 50, 60, 35],
+    [2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7],
+    [1, 2, 2],
+    [3.7, 3.3, 3.5, 2.8],
+    [100, 125, 123, 60, 45, 56, 66],
+    [2, 2, 2, 2, 2, 2],
+    [2.3],
+    [-2, -3, 1, 2, -10],
+    [1, 2, 3, 4, 5],
+    [5, 4, 3, 2, 1],
+    [-2, 2, -1, 3, 6],
+    [-10, 1, 1, -10, -10],
+)
+wt = (
+    weights([1, 1/3, 1/3, 1/3, 1]),
+    weights([1, 1, 1, 1, 1]),
+    weights([1, 1/3, 1/3, 1/3, 1, 1]),
+    weights([1/3, 1/3, 1/3, 1, 1, 1]),
+    weights([30, 191, 9, 0]),
+    weights([10, 1, 1, 1, 9]),
+    weights([10, 1, 1, 1, 900]),
+    weights([1, 3, 5, 4, 2]),
+    weights([2, 2, 5, 1, 2, 2, 1, 6]),
+    weights([0.1, 0.1, 0.8]),
+    weights([5, 5, 4, 1]),
+    weights([30, 56, 144, 24, 55, 43, 67]),
+    weights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]),
+    weights([12]),
+    weights([7, 1, 1, 1, 6]),
+    weights([1, 0, 0, 0, 2]),
+    weights([1, 2, 3, 4, 5]),
+    weights([0.1, 0.2, 0.3, 0.2, 0.1]),
+    weights([1, 1, 1, 1, 1]),
+)
+quantile_answers = (    
+    [1.0,3.6000000000000005,6.181818181818182,8.2,10.0],
+    [1.0,2.0,4.0,7.0,10.0],
+    [1.0,4.75,8.0,10.833333333333334,15.0],
+    [1.0,4.75,8.0,10.833333333333334,15.0],
+    [0.0,6.1387900355871885,11.600000000000001,15.912500000000001,30.0],
+    [1.0,1.5365853658536586,2.5999999999999996,4.405405405405405,5.0],
+    [1.0,4.239377950569287,4.492918633712858,4.746459316856429,5.0],
+    [30.0,38.75,45.714285714285715,52.85714285714286,60.0],
+    [0.3,0.6903846153846154,1.484,1.7,2.0],
+    [1.0,2.0,2.0,2.0,2.0],
+    [2.8,3.3361111111111112,3.4611111111111112,3.581578947368421,3.7],
+    [45.0,59.88593155893536,100.08846153846153,118.62115384615385,125.0],
+    [2.0,2.0,2.0,2.0,2.0],
+    [2.3,2.3,2.3,2.3,2.3],
+    [-10.0,-5.52,-2.5882352941176467,-0.9411764705882351,2.0],
+    [1.0,1.75,4.25,4.625,5.0],
+    [1.0,1.625,2.3333333333333335,3.25,5.0],
+    [-2.0,-0.5384615384615388,1.5384615384615383,2.6999999999999997,6.0],
+    [-10.0,-10.0,-10.0,1.0,1.0]
+)
+p = [0.0, 0.25, 0.5, 0.75, 1.0]
+
+srand(10)
+for i = 1:length(data)
+    @test_approx_eq quantile(data[i], wt[i], p) quantile_answers[i]
+    for j = 1:10
+        # order of p does not matter
+        reorder = sortperm(rand(length(p)))
+        @test_approx_eq quantile(data[i], wt[i], p[reorder]) quantile_answers[i][reorder]
+    end
+    for j = 1:10
+        # order of w does not matter
+        reorder = sortperm(rand(length(data[i])))
+        @test_approx_eq quantile(data[i][reorder], weights(wt[i][reorder]), p) quantile_answers[i]
+    end
+end
+# w = 1 corresponds to base quantile
+for i = 1:length(data)
+    @test_approx_eq quantile(data[i], weights(ones(Int64, length(data[i]))), p) quantile(data[i], p)
+    for j = 1:10
+        prandom = rand(4)
+        @test_approx_eq quantile(data[i], weights(ones(Int64, length(data[i]))),  prandom) quantile(data[i], prandom)
+    end
+end
+
+# other syntaxes
+v = [7, 1, 2, 4, 10]
+w = [1, 1/3, 1/3, 1/3, 1]
+answer = 6.181818181818182
+@test_approx_eq  quantile(data[1], weights(w), 0.5) answer
+@test_approx_eq  wquantile(data[1], weights(w), [0.5]) [answer]
+@test_approx_eq  wquantile(data[1], weights(w), 0.5) answer
+@test_approx_eq  wquantile(data[1], w, [0.5]) [answer]
+@test_approx_eq  wquantile(data[1], w, 0.5)  answer
+