Skip to content

Commit

Permalink
Merge pull request #6273 from lindahua/dh/cov2
Browse files Browse the repository at this point in the history
New implementation of cov/cor with extended API
lindahua committed Mar 30, 2014

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
2 parents f684174 + 7513a20 commit 5c72672
Showing 2 changed files with 466 additions and 94 deletions.
385 changes: 305 additions & 80 deletions base/statistics.jl
Original file line number Diff line number Diff line change
@@ -44,9 +44,24 @@ end
median{T<:Real}(v::AbstractArray{T}; checknan::Bool=true) =
median!(vec(copy(v)), checknan=checknan)

## variance with known mean, using pairwise summation
function varm_pairwise(A::AbstractArray, m, i1,n) # see sum_pairwise
if n < 128

## variances

function varzm_pairwise(A::AbstractArray, i1::Int, n::Int)
if n < 256
@inbounds s = abs2(A[i1])
for i=i1+1:i1+n-1
@inbounds s += abs2(A[i])
end
return s
else
n2 = div(n,2)
return varzm_pairwise(A, i1, n2) + varzm_pairwise(A, i1+n2, n-n2)
end
end

function varm_pairwise(A::AbstractArray, m::Number, i1::Int, n::Int) # see sum_pairwise
if n < 256
@inbounds s = abs2(A[i1] - m)
for i = i1+1:i1+n-1
@inbounds s += abs2(A[i] - m)
@@ -57,16 +72,51 @@ function varm_pairwise(A::AbstractArray, m, i1,n) # see sum_pairwise
return varm_pairwise(A, m, i1, n2) + varm_pairwise(A, m, i1+n2, n-n2)
end
end
function varm(v::AbstractArray, m::Number)

sumabs2(v::AbstractArray) = varzm_pairwise(v, 1, length(v))
sumabs2(v::AbstractArray, region) = sum(abs2(v), region)

function varzm(v::AbstractArray; corrected::Bool=true)
n = length(v)
if n == 0 || n == 1
return NaN
end
return varm_pairwise(v, m, 1,n) / (n - 1)
n == 0 && return NaN
return sumabs2(v) / (n - int(corrected))
end

function varzm(v::AbstractArray, region; corrected::Bool=true)
cn = regionsize(v, region) - int(corrected)
sumabs2(v, region) / cn
end

function varm(v::AbstractArray, m::Number; corrected::Bool=true)
n = length(v)
n == 0 && return NaN
return varm_pairwise(v, m, 1, n) / (n - int(corrected))
end

function varm(v::AbstractArray, m::AbstractArray, region; corrected::Bool=true)
cn = regionsize(v, region) - int(corrected)
sumabs2(v .- m, region) / cn
end

function var(v::AbstractArray; corrected::Bool=true, mean=nothing)
mean == 0 ? varzm(v; corrected=corrected) :
mean == nothing ? varm(v, Base.mean(v); corrected=corrected) :
isa(mean, Number) ? varm(v, mean; corrected=corrected) :
error("Invalid value of mean.")
end

function var(v::AbstractArray, region; corrected::Bool=true, mean=nothing)
mean == 0 ? varzm(v, region; corrected=corrected) :
mean == nothing ? varm(v, Base.mean(v, region), region; corrected=corrected) :
isa(mean, AbstractArray) ? varm(v, mean, region; corrected=corrected) :
error("Invalid value of mean.")
end


## variances over ranges

varm(v::Ranges, m::Number) = var(v)

## variance
function var(v::Ranges)
s = step(v)
l = length(v)
@@ -75,20 +125,257 @@ function var(v::Ranges)
end
return abs2(s) * (l + 1) * l / 12
end
var(v::AbstractArray) = varm(v, mean(v))
function var(v::AbstractArray, region)
x = v .- mean(v, region)
return sum(abs2(x), region) / (regionsize(v,region) - 1)

## standard deviation

function sqrt!(v::AbstractArray)
for i = 1:length(v)
v[i] = sqrt(v[i])
end
v
end

stdm(v::AbstractArray, m::Number; corrected::Bool=true) =
sqrt(varm(v, m; corrected=corrected))

std(v::AbstractArray; corrected::Bool=true, mean=nothing) =
sqrt(var(v; corrected=corrected, mean=mean))

std(v::AbstractArray, region; corrected::Bool=true, mean=nothing) =
sqrt!(var(v, region; corrected=corrected, mean=mean))


## pearson covariance functions ##

# auxiliary functions

_conj{T<:Real}(x::AbstractArray{T}) = x
_conj(x::AbstractArray) = conj(x)

_getnobs(x::AbstractVector, vardim::Int) = length(x)
_getnobs(x::AbstractMatrix, vardim::Int) = size(x, vardim)

function _getnobs(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int)
n = _getnobs(x, vardim)
_getnobs(y, vardim) == n || throw(DimensionMismatch("Dimensions of x and y mismatch."))
return n
end

## standard deviation with known mean
stdm(v, m::Number) = sqrt(varm(v, m))
_vmean(x::AbstractVector, vardim::Int) = mean(x)
_vmean(x::AbstractMatrix, vardim::Int) = mean(x, vardim)

## standard deviation
std(v) = sqrt(var(v))
std(v, region) = sqrt(var(v, region))

# core functions

unscaled_covzm(x::AbstractVector) = dot(x, x)
unscaled_covzm(x::AbstractMatrix, vardim::Int) = (vardim == 1 ? _conj(x'x) : x * x')

unscaled_covzm(x::AbstractVector, y::AbstractVector) = dot(x, y)
unscaled_covzm(x::AbstractVector, y::AbstractMatrix, vardim::Int) =
(vardim == 1 ? At_mul_B(x, _conj(y)) : At_mul_Bt(x, _conj(y)))
unscaled_covzm(x::AbstractMatrix, y::AbstractVector, vardim::Int) =
(c = vardim == 1 ? At_mul_B(x, _conj(y)) : x * _conj(y); reshape(c, length(c), 1))
unscaled_covzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int) =
(vardim == 1 ? At_mul_B(x, _conj(y)) : A_mul_Bc(x, y))

# covzm (with centered data)

covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x, x) / (length(x) - int(corrected))

covzm(x::AbstractMatrix; vardim::Int=1, corrected::Bool=true) =
scale!(unscaled_covzm(x, vardim), inv(size(x,vardim) - int(corrected)))

covzm(x::AbstractVector, y::AbstractVector; corrected::Bool=true) =
unscaled_covzm(x, y) / (length(x) - int(corrected))

covzm(x::AbstractVecOrMat, y::AbstractVecOrMat; vardim::Int=1, corrected::Bool=true) =
scale!(unscaled_covzm(x, y, vardim), inv(_getnobs(x, y, vardim) - int(corrected)))

# covm (with provided mean)

covm(x::AbstractVector, xmean; corrected::Bool=true) =
covzm(x .- xmean; corrected=corrected)

covm(x::AbstractMatrix, xmean; vardim::Int=1, corrected::Bool=true) =
covzm(x .- xmean; vardim=vardim, corrected=corrected)

covm(x::AbstractVector, xmean, y::AbstractVector, ymean; corrected::Bool=true) =
covzm(x .- xmean, y .- ymean; corrected=corrected)

covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean; vardim::Int=1, corrected::Bool=true) =
covzm(x .- xmean, y .- ymean; vardim=vardim, corrected=corrected)

# cov (API)

function cov(x::AbstractVector; corrected::Bool=true, mean=nothing)
mean == 0 ? covzm(x; corrected=corrected) :
mean == nothing ? covm(x, Base.mean(x); corrected=corrected) :
isa(mean, Number) ? covm(x, mean; corrected=corrected) :
error("Invalid value of mean.")
end

function cov(x::AbstractMatrix; vardim::Int=1, corrected::Bool=true, mean=nothing)
mean == 0 ? covzm(x; vardim=vardim, corrected=corrected) :
mean == nothing ? covm(x, _vmean(x, vardim); vardim=vardim, corrected=corrected) :
isa(mean, AbstractArray) ? covm(x, mean; vardim=vardim, corrected=corrected) :
error("Invalid value of mean.")
end

function cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true, mean=nothing)
mean == 0 ? covzm(x, y; corrected=corrected) :
mean == nothing ? covm(x, Base.mean(x), y, Base.mean(y); corrected=corrected) :
isa(mean, (Number,Number)) ? covm(x, mean[1], y, mean[2]; corrected=corrected) :
error("Invalid value of mean.")
end

function cov(x::AbstractVecOrMat, y::AbstractVecOrMat; vardim::Int=1, corrected::Bool=true, mean=nothing)
if mean == 0
covzm(x, y; vardim=vardim, corrected=corrected)
elseif mean == nothing
covm(x, _vmean(x, vardim), y, _vmean(y, vardim); vardim=vardim, corrected=corrected)
elseif isa(mean, (Any,Any))
covm(x, mean[1], y, mean[2]; vardim=vardim, corrected=corrected)
else
error("Invalid value of mean.")
end
end

# cov2cor!

function cov2cor!{T}(C::AbstractMatrix{T}, xsd::AbstractArray)
nx = length(xsd)
size(C) == (nx, nx) || throw(DimensionMismatch("Inconsistent dimensions."))
for j = 1:nx
for i = 1:j-1
C[i,j] = C[j,i]
end
C[j,j] = one(T)
for i = j+1:nx
C[i,j] /= (xsd[i] * xsd[j])
end
end
return C
end

function cov2cor!(C::AbstractMatrix, xsd::Number, ysd::AbstractArray)
nx, ny = size(C)
length(ysd) == ny || throw(DimensionMismatch("Inconsistent dimensions."))
for j = 1:ny
for i = 1:nx
C[i,j] /= (xsd * ysd[j])
end
end
return C
end

function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd::Number)
nx, ny = size(C)
length(xsd) == nx || throw(DimensionMismatch("Inconsistent dimensions."))
for j = 1:ny
for i = 1:nx
C[i,j] /= (xsd[i] * ysd)
end
end
return C
end

function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd::AbstractArray)
nx, ny = size(C)
(length(xsd) == nx && length(ysd) == ny) ||
throw(DimensionMismatch("Inconsistent dimensions."))
for j = 1:ny
for i = 1:nx
C[i,j] /= (xsd[i] * ysd[j])
end
end
return C
end


# # corzm (non-exported, with centered data)

corzm{T}(x::AbstractVector{T}) = float(one(T) * one(T))

corzm(x::AbstractMatrix; vardim::Int=1) =
(c = unscaled_covzm(x, vardim); cov2cor!(c, sqrt!(diag(c))))

function corzm(x::AbstractVector, y::AbstractVector)
n = length(x)
length(y) == n || throw(DimensionMismatch("Inconsistent lengths."))
x1 = x[1]
y1 = y[1]
xx = abs2(x1)
yy = abs2(y1)
xy = x1 * conj(y1)
i = 1
while i < n
i += 1
@inbounds xi = x[i]
@inbounds yi = y[i]
xx += abs2(xi)
yy += abs2(yi)
xy += xi * conj(yi)
end
return xy / (sqrt(xx) * sqrt(yy))
end

corzm(x::AbstractVector, y::AbstractMatrix; vardim::Int=1) =
cov2cor!(unscaled_covzm(x, y, vardim), sqrt(sumabs2(x)), sqrt!(sumabs2(y, vardim)))

corzm(x::AbstractMatrix, y::AbstractVector; vardim::Int=1) =
cov2cor!(unscaled_covzm(x, y, vardim), sqrt!(sumabs2(x, vardim)), sqrt(sumabs2(y)))

corzm(x::AbstractMatrix, y::AbstractMatrix; vardim::Int=1) =
cov2cor!(unscaled_covzm(x, y, vardim), sqrt!(sumabs2(x, vardim)), sqrt!(sumabs2(y, vardim)))

# corm

corm(x::AbstractVector, xmean) = corzm(x .- xmean)

corm(x::AbstractMatrix, xmean; vardim::Int=1) = corzm(x .- xmean; vardim=vardim)

corm(x::AbstractVector, xmean, y::AbstractVector, ymean) = corzm(x .- xmean, y .- ymean)

corm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean; vardim::Int=1) =
corzm(x .- xmean, y .- ymean; vardim=vardim)

# cor

function cor(x::AbstractVector; mean=nothing)
mean == 0 ? corzm(x) :
mean == nothing ? corm(x, Base.mean(x)) :
isa(mean, Number) ? corm(x, mean) :
error("Invalid value of mean.")
end

function cor(x::AbstractMatrix; vardim::Int=1, mean=nothing)
mean == 0 ? corzm(x; vardim=vardim) :
mean == nothing ? corm(x, _vmean(x, vardim); vardim=vardim) :
isa(mean, AbstractArray) ? corm(x, mean; vardim=vardim) :
error("Invalid value of mean.")
end

function cor(x::AbstractVector, y::AbstractVector; mean=nothing)
mean == 0 ? corzm(x, y) :
mean == nothing ? corm(x, Base.mean(x), y, Base.mean(y)) :
isa(mean, (Number,Number)) ? corm(x, mean[1], y, mean[2]) :
error("Invalid value of mean.")
end

function cor(x::AbstractVecOrMat, y::AbstractVecOrMat; vardim::Int=1, mean=nothing)
if mean == 0
corzm(x, y; vardim=vardim)
elseif mean == nothing
corm(x, _vmean(x, vardim), y, _vmean(y, vardim); vardim=vardim)
elseif isa(mean, (Any,Any))
corm(x, mean[1], y, mean[2]; vardim=vardim)
else
error("Invalid value of mean.")
end
end

## nice-valued ranges for histograms

function histrange{T<:FloatingPoint,N}(v::AbstractArray{T,N}, n::Integer)
if length(v) == 0
return Range(0.0,1.0,1)
@@ -213,68 +500,6 @@ hist2d(v::AbstractMatrix, n1::Integer, n2::Integer) =
hist2d(v::AbstractMatrix, n::Integer) = hist2d(v, n, n)
hist2d(v::AbstractMatrix) = hist2d(v, sturges(size(v,1)))

## pearson covariance functions ##

function center(x::AbstractMatrix)
m,n = size(x)
res = Array(promote_type(eltype(x),Float64), size(x))
for j in 1:n
colmean = mean(x[:,j])
for i in 1:m
res[i,j] = x[i,j] - colmean
end
end
res
end

function center(x::AbstractVector)
colmean = mean(x)
res = Array(promote_type(eltype(x),Float64), size(x))
for i in 1:length(x)
res[i] = x[i] - colmean
end
res
end

function cov(x::AbstractVecOrMat, y::AbstractVecOrMat)
size(x, 1)==size(y, 1) || throw(DimensionMismatch())
n = size(x, 1)
xc = center(x)
yc = center(y)
conj(xc' * yc / (n - 1))
end
cov(x::AbstractVector, y::AbstractVector) = cov(x'', y)[1]

function cov(x::AbstractVecOrMat)
n = size(x, 1)
xc = center(x)
conj(xc' * xc / (n - 1))
end
cov(x::AbstractVector) = cov(x'')[1]

function cor(x::AbstractVecOrMat, y::AbstractVecOrMat)
z = cov(x, y)
scale = mapslices(std, x, 1)'*mapslices(std, y, 1)
z ./ scale
end
cor(x::AbstractVector, y::AbstractVector) =
cov(x, y) / std(x) / std(y)


function cor(x::AbstractVecOrMat)
res = cov(x)
n = size(res, 1)
scale = 1 / sqrt(diag(res))
for j in 1:n
for i in 1 : j - 1
res[i,j] *= scale[i] * scale[j]
res[j,i] = res[i,j]
end
res[j,j] = 1.0
end
res
end
cor(x::AbstractVector) = cor(x'')[1]

## quantiles ##

175 changes: 161 additions & 14 deletions test/statistics.jl
Original file line number Diff line number Diff line change
@@ -19,12 +19,167 @@
@test mean([1,2,3]) == 2.
@test mean([0 1 2; 4 5 6], 1) == [2. 3. 4.]
@test mean([1 2 3; 4 5 6], 1) == [2.5 3.5 4.5]
@test var([1,2,3]) == 1.

# test var & std

@test var(1:8) == 6.
@test var([1 2 3 4 5; 6 7 8 9 10], 2) == [2.5 2.5]'
@test varm([1,2,3], 2) == 1.
@test std([1,2,3]) == 1.
@test stdm([1,2,3], 2) == 1.

@test_approx_eq varm([1,2,3], 2) 1.
@test_approx_eq var([1,2,3]) 1.
@test_approx_eq var([1,2,3]; corrected=false) 2.0/3
@test_approx_eq var([1,2,3]; mean=0) 7.
@test_approx_eq var([1,2,3]; mean=0, corrected=false) 14.0/3

@test_approx_eq var([1 2 3 4 5; 6 7 8 9 10], 2) [2.5 2.5]'
@test_approx_eq var([1 2 3 4 5; 6 7 8 9 10], 2; corrected=false) [2.0 2.0]'

@test_approx_eq stdm([1,2,3], 2) 1.
@test_approx_eq std([1,2,3]) 1.
@test_approx_eq std([1,2,3]; corrected=false) sqrt(2.0/3)
@test_approx_eq std([1,2,3]; mean=0) sqrt(7.0)
@test_approx_eq std([1,2,3]; mean=0, corrected=false) sqrt(14.0/3)

@test_approx_eq std([1 2 3 4 5; 6 7 8 9 10], 2) sqrt([2.5 2.5]')
@test_approx_eq std([1 2 3 4 5; 6 7 8 9 10], 2; corrected=false) sqrt([2.0 2.0]')

A = Complex128[exp(i*im) for i in 1:10^4]
@test_approx_eq varm(A,0.) sum(map(abs2,A))/(length(A)-1)
@test_approx_eq varm(A,mean(A)) var(A)

# test covariance

function safe_cov(x, y, zm::Bool, cr::Bool)
n = length(x)
if !zm
x = x .- mean(x)
y = y .- mean(y)
end
dot(vec(x), vec(y)) / (n - int(cr))
end

X = [1. 2. 3. 4. 5.; 5. 4. 6. 2. 1.]'
Y = [6. 1. 5. 3. 2.; 2. 7. 8. 4. 3.]'

for vd in [1, 2], zm in [true, false], cr in [true, false]
# println("vd = $vd: zm = $zm, cr = $cr")
if vd == 1
k = size(X, 2)
Cxx = zeros(k, k)
Cxy = zeros(k, k)
for i = 1:k, j = 1:k
Cxx[i,j] = safe_cov(X[:,i], X[:,j], zm, cr)
Cxy[i,j] = safe_cov(X[:,i], Y[:,j], zm, cr)
end
x1 = vec(X[:,1])
y1 = vec(Y[:,1])
else
k = size(X, 1)
Cxx = zeros(k, k)
Cxy = zeros(k, k)
for i = 1:k, j = 1:k
Cxx[i,j] = safe_cov(X[i,:], X[j,:], zm, cr)
Cxy[i,j] = safe_cov(X[i,:], Y[j,:], zm, cr)
end
x1 = vec(X[1,:])
y1 = vec(Y[1,:])
end

c = zm ? cov(x1; mean=0, corrected=cr) :
cov(x1; corrected=cr)
@test isa(c, Float64)
@test_approx_eq c Cxx[1,1]

C = zm ? cov(X; vardim=vd, mean=0, corrected=cr) :
cov(X; vardim=vd, corrected=cr)
@test size(C) == (k, k)
@test_approx_eq C Cxx

c = zm ? cov(x1, y1; mean=0, corrected=cr) :
cov(x1, y1; corrected=cr)
@test isa(c, Float64)
@test_approx_eq c Cxy[1,1]

C = zm ? cov(x1, Y; vardim=vd, mean=0, corrected=cr) :
cov(x1, Y; vardim=vd, corrected=cr)
@test size(C) == (1, k)
@test_approx_eq C Cxy[1,:]

C = zm ? cov(X, y1; vardim=vd, mean=0, corrected=cr) :
cov(X, y1; vardim=vd, corrected=cr)
@test size(C) == (k, 1)
@test_approx_eq C Cxy[:,1]

C = zm ? cov(X, Y; vardim=vd, mean=0, corrected=cr) :
cov(X, Y; vardim=vd, corrected=cr)
@test size(C) == (k, k)
@test_approx_eq C Cxy
end

# test correlation

function safe_cor(x, y, zm::Bool)
if !zm
x = x .- mean(x)
y = y .- mean(y)
end
x = vec(x)
y = vec(y)
dot(x, y) / (sqrt(dot(x, x)) * sqrt(dot(y, y)))
end

for vd in [1, 2], zm in [true, false]
# println("vd = $vd: zm = $zm")
if vd == 1
k = size(X, 2)
Cxx = zeros(k, k)
Cxy = zeros(k, k)
for i = 1:k, j = 1:k
Cxx[i,j] = safe_cor(X[:,i], X[:,j], zm)
Cxy[i,j] = safe_cor(X[:,i], Y[:,j], zm)
end
x1 = vec(X[:,1])
y1 = vec(Y[:,1])
else
k = size(X, 1)
Cxx = zeros(k, k)
Cxy = zeros(k, k)
for i = 1:k, j = 1:k
Cxx[i,j] = safe_cor(X[i,:], X[j,:], zm)
Cxy[i,j] = safe_cor(X[i,:], Y[j,:], zm)
end
x1 = vec(X[1,:])
y1 = vec(Y[1,:])
end

c = zm ? cor(x1; mean=0) : cor(x1)
@test isa(c, Float64)
@test_approx_eq c Cxx[1,1]

C = zm ? cor(X; vardim=vd, mean=0) : cor(X; vardim=vd)
@test size(C) == (k, k)
@test_approx_eq C Cxx

c = zm ? cor(x1, y1; mean=0) : cor(x1, y1)
@test isa(c, Float64)
@test_approx_eq c Cxy[1,1]

C = zm ? cor(x1, Y; vardim=vd, mean=0) : cor(x1, Y; vardim=vd)
@test size(C) == (1, k)
@test_approx_eq C Cxy[1,:]

C = zm ? cor(X, y1; vardim=vd, mean=0) : cor(X, y1; vardim=vd)
@test size(C) == (k, 1)
@test_approx_eq C Cxy[:,1]

C = zm ? cor(X, Y; vardim=vd, mean=0) : cor(X, Y; vardim=vd)
@test size(C) == (k, k)
@test_approx_eq C Cxy
end



# test hist

@test sum(hist([1,2,3])[2]) == 3
@test hist([])[2] == []
@test hist([1])[2] == [1]
@@ -34,10 +189,6 @@
@test hist([1,1,1,1,1])[2][1] == 5
@test sum(hist2d(rand(100, 2))[3]) == 100

A = Complex128[exp(i*im) for i in 1:10^4]
@test_approx_eq varm(A,0.) sum(map(abs2,A))/(length(A)-1)
@test_approx_eq varm(A,mean(A)) var(A)

@test midpoints(1.0:1.0:10.0) == 1.5:1.0:9.5
@test midpoints(1:10) == 1.5:9.5
@test midpoints(Float64[1.0:1.0:10.0]) == Float64[1.5:1.0:9.5]
@@ -46,8 +197,4 @@ A = Complex128[exp(i*im) for i in 1:10^4]
@test quantile([1., 3],[.25,.5,.75])[2] == median([1., 3])
@test quantile([0.:100.],[.1,.2,.3,.4,.5,.6,.7,.8,.9])[1] == 10.0

# Test covariance
X = [1 0; 2 1; 3 0; 4 1; 5 10]
y = [5, 3, 4, 2, 5]
@test_approx_eq cov(X[:,1], X[:,2]) cov(X)[1,2]
@test issym(cov(X))

0 comments on commit 5c72672

Please sign in to comment.