Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wquantile #117

Merged
merged 1 commit into from
Oct 6, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/source/scalarstats.rst
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,10 @@ Quantile and Friends
w = rand(n)
xk = median(x, weights(w))

.. function:: quantile(x, w, p)

Compute the weighted quantiles of a vector ``x`` at a specified set of probability values ``p``, using weights given by a weight vector ``w`` (of type ``WeightVec``). Weights must not be negative. The weights and data vectors must have the same length. The quantile for :math:`p` is defined as follows. Denoting :math:`S_k = (k-1)w_k + (n-1) \sum_{i<k}w_i`, define :math:`x_{k+1}` the smallest element of ``x`` such that :math:`S_{k+1}/S_{n}` is strictly superior to :math:`p`. The function returns :math:`(1-\gamma) x_k + \gamma x_{k+1}` with :math:`\gamma = (pS_n- S_k)/(S_{k+1}-S_k)`. This corresponds to R-7, Excel, SciPy-(1,1), Maple-6 when ``w`` is one (see https://en.wikipedia.org/wiki/Quantile).

Mode and Modes
---------------

Expand Down
1 change: 1 addition & 0 deletions src/StatsBase.jl
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ module StatsBase
wmean, # weighted mean
wmean!, # weighted mean across dimensions with provided storage
wmedian, # weighted median
wquantile, # weighted quantile

## moments
skewness, # (standardized) skewness
Expand Down
81 changes: 81 additions & 0 deletions src/weights.jl
Original file line number Diff line number Diff line change
Expand Up @@ -293,3 +293,84 @@ end

wmedian(v::RealVector, w::RealVector) = median(v, weights(w))
wmedian{W<:Real}(v::RealVector, w::WeightVec{W}) = median(v, w)

###### Weighted quantile #####

# http://stats.stackexchange.com/questions/13169/defining-quantiles-over-a-weighted-sample
# In the non weighted version, we compute a vector of index h(N, p)
# and take interpolation between floor and ceil of this index
# Here there is a supplementary function from index to weighted index k -> Sk

function quantile{V, W <: Real}(v::RealVector{V}, w::WeightVec{W}, p::RealVector)

# checks
isempty(v) && error("quantile of an empty array is undefined")
isempty(p) && throw(ArgumentError("empty quantile array"))

w.sum == 0 && error("weight vector cannot sum to zero")
length(v) == length(w) || error("data and weight vectors must be the same size, got $(length(v)) and $(length(w))")
for x in w.values
isnan(x) && error("weight vector cannot contain NaN entries")
x < 0 && error("weight vector cannot contain negative entries")
end

# full sort
vw = sort!(collect(zip(v, w.values)))

wsum = w.sum

# prepare percentiles
ppermute = sortperm(p)
p = p[ppermute]
p = bound_quantiles(p)

# prepare out vector
N = length(vw)
out = Array(typeof(zero(V)/1), length(p))
fill!(out, vw[end][1])

# start looping on quantiles
cumulative_weight, Sk, Skold = zero(W), zero(W), zero(W)
vk, vkold = zero(V), zero(V)
k = 1
for i in 1:length(p)
h = p[i] * (N - 1) * wsum
if h == 0
# happens when N or p or wsum equal zero
out[ppermute[i]] = vw[1][1]
else
while Sk <= h
# happens in particular when k == 1
vk, wk = vw[k]
cumulative_weight += wk
if k >= N
# out was initialized with maximum v
return out
end
k += 1
Skold, vkold = Sk, vk
vk, wk = vw[k]
Sk = (k - 1) * wk + (N - 1) * cumulative_weight
end
# in particular, Sk is different from Skold
g = (h - Skold) / (Sk - Skold)
out[ppermute[i]] = vkold + g * (vk - vkold)
end
end
return out
end

# similarly to statistics.jl in Base
function bound_quantiles{T <: Real}(qs::AbstractVector{T})
epsilon = 100 * eps()
if (any(qs .< -epsilon) || any(qs .> 1+epsilon))
throw(ArgumentError("quantiles out of [0,1] range"))
end
T[min(one(T), max(zero(T), q)) for q = qs]
end

quantile{W <: Real}(v::RealVector, w::WeightVec{W}, p::Number) = quantile(v, w, [p])[1]
wquantile{W <: Real}(v::RealVector, w::WeightVec{W}, p::RealVector) = quantile(v, w, p)
wquantile{W <: Real}(v::RealVector, w::WeightVec{W}, p::Number) = quantile(v, w, [p])[1]
wquantile(v::RealVector, w::RealVector, p::RealVector) = quantile(v, weights(w), p)
wquantile(v::RealVector, w::RealVector, p::Number) = quantile(v, weights(w), [p])[1]
101 changes: 101 additions & 0 deletions test/weights.jl
Original file line number Diff line number Diff line change
Expand Up @@ -233,3 +233,104 @@ wt = [-1, -1, -1, -1, -1]
@test_throws ErrorException median(data, weights(wt))
wt = [-1, -1, -1, 0, 0]
@test_throws ErrorException median(data, weights(wt))


# Weighted quantile tests
data = (
[7, 1, 2, 4, 10],
[7, 1, 2, 4, 10],
[7, 1, 2, 4, 10, 15],
[1, 2, 4, 7, 10, 15],
[0, 10, 20, 30],
[1, 2, 3, 4, 5],
[1, 2, 3, 4, 5],
[30, 40, 50, 60, 35],
[2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7],
[1, 2, 2],
[3.7, 3.3, 3.5, 2.8],
[100, 125, 123, 60, 45, 56, 66],
[2, 2, 2, 2, 2, 2],
[2.3],
[-2, -3, 1, 2, -10],
[1, 2, 3, 4, 5],
[5, 4, 3, 2, 1],
[-2, 2, -1, 3, 6],
[-10, 1, 1, -10, -10],
)
wt = (
weights([1, 1/3, 1/3, 1/3, 1]),
weights([1, 1, 1, 1, 1]),
weights([1, 1/3, 1/3, 1/3, 1, 1]),
weights([1/3, 1/3, 1/3, 1, 1, 1]),
weights([30, 191, 9, 0]),
weights([10, 1, 1, 1, 9]),
weights([10, 1, 1, 1, 900]),
weights([1, 3, 5, 4, 2]),
weights([2, 2, 5, 1, 2, 2, 1, 6]),
weights([0.1, 0.1, 0.8]),
weights([5, 5, 4, 1]),
weights([30, 56, 144, 24, 55, 43, 67]),
weights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]),
weights([12]),
weights([7, 1, 1, 1, 6]),
weights([1, 0, 0, 0, 2]),
weights([1, 2, 3, 4, 5]),
weights([0.1, 0.2, 0.3, 0.2, 0.1]),
weights([1, 1, 1, 1, 1]),
)
quantile_answers = (
[1.0,3.6000000000000005,6.181818181818182,8.2,10.0],
[1.0,2.0,4.0,7.0,10.0],
[1.0,4.75,8.0,10.833333333333334,15.0],
[1.0,4.75,8.0,10.833333333333334,15.0],
[0.0,6.1387900355871885,11.600000000000001,15.912500000000001,30.0],
[1.0,1.5365853658536586,2.5999999999999996,4.405405405405405,5.0],
[1.0,4.239377950569287,4.492918633712858,4.746459316856429,5.0],
[30.0,38.75,45.714285714285715,52.85714285714286,60.0],
[0.3,0.6903846153846154,1.484,1.7,2.0],
[1.0,2.0,2.0,2.0,2.0],
[2.8,3.3361111111111112,3.4611111111111112,3.581578947368421,3.7],
[45.0,59.88593155893536,100.08846153846153,118.62115384615385,125.0],
[2.0,2.0,2.0,2.0,2.0],
[2.3,2.3,2.3,2.3,2.3],
[-10.0,-5.52,-2.5882352941176467,-0.9411764705882351,2.0],
[1.0,1.75,4.25,4.625,5.0],
[1.0,1.625,2.3333333333333335,3.25,5.0],
[-2.0,-0.5384615384615388,1.5384615384615383,2.6999999999999997,6.0],
[-10.0,-10.0,-10.0,1.0,1.0]
)
p = [0.0, 0.25, 0.5, 0.75, 1.0]

srand(10)
for i = 1:length(data)
@test_approx_eq quantile(data[i], wt[i], p) quantile_answers[i]
for j = 1:10
# order of p does not matter
reorder = sortperm(rand(length(p)))
@test_approx_eq quantile(data[i], wt[i], p[reorder]) quantile_answers[i][reorder]
end
for j = 1:10
# order of w does not matter
reorder = sortperm(rand(length(data[i])))
@test_approx_eq quantile(data[i][reorder], weights(wt[i][reorder]), p) quantile_answers[i]
end
end
# w = 1 corresponds to base quantile
for i = 1:length(data)
@test_approx_eq quantile(data[i], weights(ones(Int64, length(data[i]))), p) quantile(data[i], p)
for j = 1:10
prandom = rand(4)
@test_approx_eq quantile(data[i], weights(ones(Int64, length(data[i]))), prandom) quantile(data[i], prandom)
end
end

# other syntaxes
v = [7, 1, 2, 4, 10]
w = [1, 1/3, 1/3, 1/3, 1]
answer = 6.181818181818182
@test_approx_eq quantile(data[1], weights(w), 0.5) answer
@test_approx_eq wquantile(data[1], weights(w), [0.5]) [answer]
@test_approx_eq wquantile(data[1], weights(w), 0.5) answer
@test_approx_eq wquantile(data[1], w, [0.5]) [answer]
@test_approx_eq wquantile(data[1], w, 0.5) answer