diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 74e4c14cd..433237ecc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: fail-fast: false matrix: version: - - '1.0' + - '1.6' - '1' # automatically expands to the latest stable 1.x release of Julia - 'nightly' os: diff --git a/Project.toml b/Project.toml index 5e6bffed1..076dd40d4 100644 --- a/Project.toml +++ b/Project.toml @@ -24,7 +24,7 @@ Missings = "0.3, 0.4, 1.0" SortingAlgorithms = "0.3, 1.0" Statistics = "1" StatsAPI = "1.2" -julia = "1" +julia = "1.6" [extras] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" diff --git a/src/pairwise.jl b/src/pairwise.jl index de1d47544..96a204b57 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -1,25 +1,53 @@ -function _pairwise!(::Val{:none}, f, dest::AbstractMatrix, x, y, symmetric::Bool) - @inbounds for (i, xi) in enumerate(x), (j, yj) in enumerate(y) - symmetric && i > j && continue - - # For performance, diagonal is special-cased - if f === cor && eltype(dest) !== Union{} && i == j && xi === yj - # TODO: float() will not be needed after JuliaLang/Statistics.jl#61 - dest[i, j] = float(cor(xi)) - else - dest[i, j] = f(xi, yj) - end +function _pairwise!(::Val{:none}, f, dest::AbstractMatrix{V}, x, y, + symmetric::Bool) where {V} + + nr, nc = size(dest) + + # Swap x and y for more efficient threaded loop. + if nc < nr + dest2 = similar(dest, size(dest, 2), size(dest, 1)) + _pairwise!(Val(:none), f, dest2, y, x, symmetric) + dest .= transpose(dest2) + return dest end - if symmetric - m, n = size(dest) - @inbounds for j in 1:n, i in (j+1):m - dest[i, j] = dest[j, i] + + #cor and friends are special-cased. + diag_is_1 = (f in (corkendall, corspearman, cor)) + (diag_is_1 || f == cov) && (symmetric = x === y) + #cov(x) is faster than cov(x, x) + (f == cov) && (f = ((x, y) -> x === y ? cov(x) : cov(x, y))) + + Threads.@threads for subset in EqualSumSubsets(nc, Threads.nthreads()) + for j in subset + for i = (symmetric ? j : 1):nr + # For performance, diagonal is special-cased + if diag_is_1 && i == j && x[i] === y[j] + dest[i, j] = V === Missing ? missing : 1.0 + else + dest[i, j] = f(x[i], y[j]) + end + end end end + symmetric && LinearAlgebra.copytri!(dest, 'L') return dest end -function check_vectors(x, y, skipmissing::Symbol) +#Input validation for both pairwise and pairwise! +function check_vectors(x, y, skipmissing::Symbol, symmetric::Bool) + + if symmetric && x !== y + throw(ArgumentError("symmetric=true only makes sense passing " * + "a single set of variables (x === y)")) + end + + if !(skipmissing in (:none, :pairwise, :listwise)) + throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise")) + end + + #When skipmissing is :none, elements of x/y can have unequal length. + skipmissing == :none && return + m = length(x) n = length(y) if !(all(xi -> xi isa AbstractVector, x) && all(yi -> yi isa AbstractVector, y)) @@ -46,76 +74,59 @@ function check_vectors(x, y, skipmissing::Symbol) end end -function _pairwise!(::Val{:pairwise}, f, dest::AbstractMatrix, x, y, symmetric::Bool) - check_vectors(x, y, :pairwise) - @inbounds for (j, yj) in enumerate(y) - ynminds = .!ismissing.(yj) - @inbounds for (i, xi) in enumerate(x) - symmetric && i > j && continue +function _pairwise!(::Val{:pairwise}, f, dest::AbstractMatrix{V}, x, y, symmetric::Bool) where {V} - if xi === yj - ynm = view(yj, ynminds) - # For performance, diagonal is special-cased - if f === cor && eltype(dest) !== Union{} && i == j - # TODO: float() will not be needed after JuliaLang/Statistics.jl#61 - dest[i, j] = float(cor(xi)) + nr, nc = size(dest) + m = length(x) == 0 ? 0 : length(first(x)) + + # Swap x and y for more efficient threaded loop. + if nc < nr + dest2 = similar(dest, size(dest, 2), size(dest, 1)) + _pairwise!(Val(:pairwise), f, dest2, y, x, symmetric) + dest .= transpose(dest2) + return dest + end + + #cor and friends are special-cased. + diag_is_1 = (f in (corkendall, corspearman, cor)) + (diag_is_1 || f == cov) && (symmetric = x === y) + #cov(x) is faster than cov(x, x) + (f == cov) && (f = ((x, y) -> x === y ? cov(x) : cov(x, y))) + + nmtx = promoted_nmtype(x)[] + nmty = promoted_nmtype(y)[] + + Threads.@threads for subset in EqualSumSubsets(nc, Threads.nthreads()) + scratch_fx = task_local_vector(:scratch_fx, nmtx, m) + scratch_fy = task_local_vector(:scratch_fy, nmty, m) + for j in subset + for i = (symmetric ? j : 1):nr + if diag_is_1 && i == j && x[i] === y[j] && V !== Missing + dest[i, j] = 1.0 else - dest[i, j] = f(ynm, ynm) + dest[i, j] = f(handle_pairwise(x[i], y[j]; scratch_fx=scratch_fx, scratch_fy=scratch_fy)...) end - else - nminds = .!ismissing.(xi) .& ynminds - xnm = view(xi, nminds) - ynm = view(yj, nminds) - dest[i, j] = f(xnm, ynm) end end end - if symmetric - m, n = size(dest) - @inbounds for j in 1:n, i in (j+1):m - dest[i, j] = dest[j, i] - end - end + symmetric && LinearAlgebra.copytri!(dest, 'L') return dest end function _pairwise!(::Val{:listwise}, f, dest::AbstractMatrix, x, y, symmetric::Bool) - check_vectors(x, y, :listwise) - nminds = .!ismissing.(first(x)) - @inbounds for xi in Iterators.drop(x, 1) - nminds .&= .!ismissing.(xi) - end - if x !== y - @inbounds for yj in y - nminds .&= .!ismissing.(yj) - end - end - - # Computing integer indices once for all vectors is faster - nminds′ = findall(nminds) - # TODO: check whether wrapping views in a custom array type which asserts - # that entries cannot be `missing` (similar to `skipmissing`) - # could offer better performance - return _pairwise!(Val(:none), f, dest, - [view(xi, nminds′) for xi in x], - [view(yi, nminds′) for yi in y], - symmetric) + return _pairwise!(Val(:none), f, dest, handle_listwise(x, y)..., symmetric) end -function _pairwise!(f, dest::AbstractMatrix, x, y; - symmetric::Bool=false, skipmissing::Symbol=:none) - if !(skipmissing in (:none, :pairwise, :listwise)) - throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise")) - end +function _pairwise!(f, dest::AbstractMatrix, x, y; symmetric::Bool=false, + skipmissing::Symbol=:none) - x′ = x isa Union{AbstractArray, Tuple, NamedTuple} ? x : collect(x) - y′ = y isa Union{AbstractArray, Tuple, NamedTuple} ? y : collect(y) + x′ = x isa Union{AbstractArray,Tuple,NamedTuple} ? x : collect(x) + y′ = y isa Union{AbstractArray,Tuple,NamedTuple} ? y : collect(y) m = length(x′) n = length(y′) size(dest) != (m, n) && throw(DimensionMismatch("dest has dimensions $(size(dest)) but expected ($m, $n)")) - Base.has_offset_axes(dest) && throw("dest indices must start at 1") return _pairwise!(Val(skipmissing), f, dest, x′, y′, symmetric) @@ -134,7 +145,7 @@ end # Identical to `Base.promote_typejoin` except that it uses `promote_type` # instead of `typejoin` to combine members of `Union` types -function promote_type_union(::Type{T}) where T +function promote_type_union(::Type{T}) where {T} if T === Union{} return Union{} elseif T isa UnionAll @@ -149,14 +160,14 @@ function promote_type_union(::Type{T}) where T end function _pairwise(::Val{skipmissing}, f, x, y, symmetric::Bool) where {skipmissing} - x′ = x isa Union{AbstractArray, Tuple, NamedTuple} ? x : collect(x) - y′ = y isa Union{AbstractArray, Tuple, NamedTuple} ? y : collect(y) + x′ = x isa Union{AbstractArray,Tuple,NamedTuple} ? x : collect(x) + y′ = y isa Union{AbstractArray,Tuple,NamedTuple} ? y : collect(y) m = length(x′) n = length(y′) - T = Core.Compiler.return_type(f, Tuple{eltype(x′), eltype(y′)}) - Tsm = Core.Compiler.return_type((x, y) -> f(disallowmissing(x), disallowmissing(y)), - Tuple{eltype(x′), eltype(y′)}) + T = Core.Compiler.return_type(f, Tuple{eltype(x′),eltype(y′)}) + Tsm = Core.Compiler.return_type((x, y) -> f(handle_pairwise(x, y)...), + Tuple{eltype(x′),eltype(y′)}) if skipmissing === :none dest = Matrix{T}(undef, m, n) @@ -178,7 +189,7 @@ function _pairwise(::Val{skipmissing}, f, x, y, symmetric::Bool) where {skipmiss # but using `promote_type` rather than `promote_typejoin`) U = mapreduce(typeof, promote_type, dest) # V is inferred (contrary to U), but it only gives an upper bound for U - V = promote_type_union(Union{T, Tsm}) + V = promote_type_union(Union{T,Tsm}) return convert(Matrix{U}, dest)::Matrix{<:V} end end @@ -193,15 +204,15 @@ entries in `x` and columns to entries in `y`, and `dest` must therefore be of size `length(x) × length(y)`. If `y` is omitted then `x` is crossed with itself. -As a special case, if `f` is `cor`, diagonal cells for which entries -from `x` and `y` are identical (according to `===`) are set to one even -in the presence `missing`, `NaN` or `Inf` entries. +As a special case, if `f` is `cor`, `corspearman` or `corkendall`, diagonal cells for +which entries from `x` and `y` are identical (according to `===`) are set to one even in the +presence `missing`, `NaN` or `Inf` entries. # Keyword arguments - `symmetric::Bool=false`: If `true`, `f` is only called to compute for the lower triangle of the matrix, and these values are copied - to fill the upper triangle. Only allowed when `y` is omitted. - Defaults to `true` when `f` is `cor` or `cov`. + to fill the upper triangle. Only allowed when `y` is omitted and ignored (taken as `true`) + if `f` is `cov`, `cor`, `corkendall` or `corspearman`. - `skipmissing::Symbol=:none`: If `:none` (the default), missing values in inputs are passed to `f` without any modification. Use `:pairwise` to skip entries with a `missing` value in either @@ -211,7 +222,7 @@ in the presence `missing`, `NaN` or `Inf` entries. Only allowed when entries in `x` and `y` are vectors. # Examples -```jldoctest +```julia-repl julia> using StatsBase, Statistics julia> dest = zeros(3, 3); @@ -244,12 +255,8 @@ julia> dest ``` """ function pairwise!(f, dest::AbstractMatrix, x, y=x; - symmetric::Bool=false, skipmissing::Symbol=:none) - if symmetric && x !== y - throw(ArgumentError("symmetric=true only makes sense passing " * - "a single set of variables (x === y)")) - end - + symmetric::Bool=false, skipmissing::Symbol=:none) + check_vectors(x, y, skipmissing, symmetric) return _pairwise!(f, dest, x, y, symmetric=symmetric, skipmissing=skipmissing) end @@ -262,15 +269,15 @@ of entries in iterators `x` and `y`. Rows correspond to entries in `x` and columns to entries in `y`. If `y` is omitted then a square matrix crossing `x` with itself is returned. -As a special case, if `f` is `cor`, diagonal cells for which entries -from `x` and `y` are identical (according to `===`) are set to one even -in the presence `missing`, `NaN` or `Inf` entries. +As a special case, if `f` is `cor`, `corspearman` or `corkendall`, diagonal cells for +which entries from `x` and `y` are identical (according to `===`) are set to one even in the +presence `missing`, `NaN` or `Inf` entries. # Keyword arguments - `symmetric::Bool=false`: If `true`, `f` is only called to compute for the lower triangle of the matrix, and these values are copied - to fill the upper triangle. Only allowed when `y` is omitted. - Defaults to `true` when `f` is `cor` or `cov`. + to fill the upper triangle. Only allowed when `y` is omitted and ignored (taken as `true`) + if `f` is `cov`, `cor`, `corkendall` or `corspearman`. - `skipmissing::Symbol=:none`: If `:none` (the default), missing values in inputs are passed to `f` without any modification. Use `:pairwise` to skip entries with a `missing` value in either @@ -280,7 +287,7 @@ in the presence `missing`, `NaN` or `Inf` entries. Only allowed when entries in `x` and `y` are vectors. # Examples -```jldoctest +```julia-repl julia> using StatsBase, Statistics julia> x = [1 3 7 @@ -307,34 +314,197 @@ julia> pairwise(cor, eachcol(y), skipmissing=:pairwise) ``` """ function pairwise(f, x, y=x; symmetric::Bool=false, skipmissing::Symbol=:none) - if symmetric && x !== y - throw(ArgumentError("symmetric=true only makes sense passing " * - "a single set of variables (x === y)")) + check_vectors(x, y, skipmissing, symmetric) + return _pairwise(Val(skipmissing), f, x, y, symmetric) +end + +# Auxiliary functions for pairwise + +promoted_type(x) = mapreduce(eltype, promote_type, x, init=Union{}) +promoted_nmtype(x) = mapreduce(nonmissingtype ∘ eltype, promote_type, x, init=Union{}) + +""" + handle_listwise(x, y) + +Remove missings in a listwise manner. Assumes `x` and `y` are vectors of iterables which +have been validated via `check_vectors`. + +## Example +```julia-repl +julia> a = [6,7,8,9,10,missing];b = [4,5,6,7,missing,8];c = [2,3,4,missing,5,6];d = [1,2,3,4,5,6]; + +julia> StatsBase.handle_listwise([a,b],[c,d]) +([[6, 7, 8], [4, 5, 6]], [[2, 3, 4], [1, 2, 3]]) +``` +""" +function handle_listwise(x, y) + if !(missing isa promoted_type(x) || missing isa promoted_type(y)) + return x, y end - return _pairwise(Val(skipmissing), f, x, y, symmetric) + nminds = .!ismissing.(first(x)) + @inbounds for xi in Iterators.drop(x, 1) + nminds .&= .!ismissing.(xi) + end + if x !== y + @inbounds for yj in y + nminds .&= .!ismissing.(yj) + end + end + + # Computing integer indices once for all vectors is faster + nminds′ = findall(nminds) + # TODO: check whether wrapping views in a custom array type which asserts + # that entries cannot be `missing` (similar to `skipmissing`) + # could offer better performance + + x′ = [disallowmissing(view(xi, nminds′)) for xi in x] + if x === y + return x′, x′ + else + y′ = [disallowmissing(view(yi, nminds′)) for yi in y] + return x′, y′ + end +end + +""" + handle_pairwise(x::AbstractVector, y::AbstractVector; + scratch_fx::AbstractVector=similar(x, nonmissingtype(eltype(x))), + scratch_fy::AbstractVector=similar(y, nonmissingtype(eltype(y)))) + +Return a pair `(a,b)`, filtered copies of `(x,y)`, in which elements `x[i]` and +`y[i]` are excluded if `ismissing(x[i])||ismissing(y[i])`. +""" +function handle_pairwise(x::AbstractVector, y::AbstractVector; + scratch_fx::AbstractVector=similar(x, nonmissingtype(eltype(x))), + scratch_fy::AbstractVector=similar(y, nonmissingtype(eltype(y)))) + + axes(x) == axes(y) || throw(DimensionMismatch("x and y have inconsistent dimensions")) + lb = first(axes(x, 1)) + j = lb + @inbounds for i in eachindex(x) + if !(ismissing(x[i]) || ismissing(y[i])) + scratch_fx[j] = x[i] + scratch_fy[j] = y[i] + j += 1 + end + end + + return view(scratch_fx, lb:(j-1)), view(scratch_fy, lb:(j-1)) end -# cov(x) is faster than cov(x, x) -_cov(x, y) = x === y ? cov(x) : cov(x, y) +""" + task_local_vector(key::Symbol, similarto::AbstractArray{V}, + length::Int)::Vector{V} where {V} + +Retrieve from task local storage a vector of length `length` and matching the element +type of `similarto`, with initialisation on first call during a task. +""" +function task_local_vector(key::Symbol, similarto::AbstractArray{V}, + length::Int)::Vector{V} where {V} + haskey(task_local_storage(), key) || task_local_storage(key, similar(similarto, length)) + return task_local_storage(key) +end + +""" + EqualSumSubsets + +An iterator that partitions the integers 1 to n into `num_subsets` "subsets" (of type +TwoStepRange) such that a) each subset is of nearly equal length; and b) the sum of the +elements in each subset is nearly equal. If `n` is a multiple of `2 * num_subsets` both +conditions hold exactly. + +## Example +```julia-repl +julia> for s in StatsBase.EqualSumSubsets(30,5) +println((collect(s), sum(s))) +end +([30, 21, 20, 11, 10, 1], 93) +([29, 22, 19, 12, 9, 2], 93) +([28, 23, 18, 13, 8, 3], 93) +([27, 24, 17, 14, 7, 4], 93) +([26, 25, 16, 15, 6, 5], 93) + +#Check for correct partitioning, in this case of integers 1:1000 into 17 subsets. +julia> sort(vcat([collect(s) for s in StatsBase.EqualSumSubsets(1000,17)]...))==1:1000 +true -pairwise!(::typeof(cov), dest::AbstractMatrix, x, y; - symmetric::Bool=false, skipmissing::Symbol=:none) = - pairwise!(_cov, dest, x, y, symmetric=symmetric, skipmissing=skipmissing) +``` +""" +struct EqualSumSubsets + n::Int + num_subsets::Int + + function EqualSumSubsets(n, num_subsets) + n >= 0 || throw("n must not be negative, but got $n") + num_subsets > 0 || throw("num_subsets must be positive, but got $num_subsets") + return new(n, num_subsets) + end -pairwise(::typeof(cov), x, y; symmetric::Bool=false, skipmissing::Symbol=:none) = - pairwise(_cov, x, y, symmetric=symmetric, skipmissing=skipmissing) +end -pairwise!(::typeof(cov), dest::AbstractMatrix, x; - symmetric::Bool=true, skipmissing::Symbol=:none) = - pairwise!(_cov, dest, x, x, symmetric=symmetric, skipmissing=skipmissing) +Base.eltype(::EqualSumSubsets) = TwoStepRange +Base.length(x::EqualSumSubsets) = min(x.n, x.num_subsets) +Base.firstindex(::EqualSumSubsets) = 1 -pairwise(::typeof(cov), x; symmetric::Bool=true, skipmissing::Symbol=:none) = - pairwise(_cov, x, x, symmetric=symmetric, skipmissing=skipmissing) +function Base.iterate(ess::EqualSumSubsets, state::Int=1) + state > length(ess) && return nothing + return getindex(ess, state), state + 1 +end -pairwise!(::typeof(cor), dest::AbstractMatrix, x; - symmetric::Bool=true, skipmissing::Symbol=:none) = - pairwise!(cor, dest, x, x, symmetric=symmetric, skipmissing=skipmissing) +function Base.getindex(ess::EqualSumSubsets, i::Int=1) + n, nss = ess.n, ess.num_subsets + step1 = 2i - 2nss - 1 + step2 = 1 - 2i + return TwoStepRange(n - i + 1, step1, step2) +end + +""" +TwoStepRange + +Range with a starting value of `start`, stop value of `1` and a step that alternates +between `step1` and `step2`. `start` must be positive and `step1` and `step2` must be +negative. + +# Examples +```julia-repl +julia> collect(StatsBase.TwoStepRange(30,-7,-3)) +6-element Vector{Int64}: + 30 + 23 + 20 + 13 + 10 + 3 + +``` +""" +struct TwoStepRange + start::Int + step1::Int + step2::Int + + function TwoStepRange(start, step1, step2) + start > 0 || throw("start must be positive, but got $start") + step1 < 0 || throw("step1 must be negative, but got $step1") + step2 < 0 || throw("step2 must be negative, but got $step2") + return new(start, step1, step2) + end +end + +Base.eltype(::TwoStepRange) = Int + +function Base.length(tsr::TwoStepRange) + return length((tsr.start):(tsr.step1+tsr.step2):1) + + length((tsr.start+tsr.step1):(tsr.step1+tsr.step2):1) +end + +function Base.iterate(tsr::TwoStepRange, i::Int=1) + (i > length(tsr)) && return nothing + return getindex(tsr, i), i + 1 +end -pairwise(::typeof(cor), x; symmetric::Bool=true, skipmissing::Symbol=:none) = - pairwise(cor, x, x, symmetric=symmetric, skipmissing=skipmissing) +function Base.getindex(tsr::TwoStepRange, i::Int=1)::Int + a, b = divrem(i - 1, 2) + return tsr.start + (tsr.step1 + tsr.step2) * a + b * tsr.step1 +end \ No newline at end of file diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 95c5cb03e..eb9c4c211 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -11,110 +11,386 @@ ####################################### """ - corspearman(x, y=x) + corspearman(x, y=x; skipmissing::Symbol=:none) Compute Spearman's rank correlation coefficient. If `x` and `y` are vectors, the output is a float, otherwise it's a matrix corresponding to the pairwise correlations of the columns of `x` and `y`. + +Uses multiple threads when either `x` or `y` is a matrix and `skipmissing` is `:pairwise`. + +# Keyword argument + +- `skipmissing::Symbol=:none`: If `:none` (the default), `missing` entries in `x` or `y` + give rise to `missing` entries in the return. If `:pairwise` when calculating an element + of the return, both `i`th entries of the input vectors are skipped if either is missing. + If `:listwise` the `i`th rows of both `x` and `y` are skipped if `missing` appears in + either; note that this might skip a high proportion of entries. Only allowed when `x` or + `y` is a matrix. """ -function corspearman(x::AbstractVector{<:Real}, y::AbstractVector{<:Real}) - n = length(x) - n == length(y) || throw(DimensionMismatch("vectors must have same length")) - (any(isnan, x) || any(isnan, y)) && return NaN - return cor(tiedrank(x), tiedrank(y)) +function corspearman(x::AbstractVector, y::AbstractVector; skipmissing::Symbol=:none) + check_rankcor_args(x, y, skipmissing, false) + if x === y + return corspearman(x) + else + return corspearman_kernel!(x, y, skipmissing) + end end -function corspearman(X::AbstractMatrix{<:Real}, y::AbstractVector{<:Real}) - size(X, 1) == length(y) || - throw(DimensionMismatch("X and y have inconsistent dimensions")) - n = size(X, 2) - C = Matrix{Float64}(I, n, 1) - any(isnan, y) && return fill!(C, NaN) - yrank = tiedrank(y) - for j = 1:n - Xj = view(X, :, j) - if any(isnan, Xj) - C[j,1] = NaN - else - Xjrank = tiedrank(Xj) - C[j,1] = cor(Xjrank, yrank) - end +function corspearman(x::AbstractMatrix, y::AbstractVector; skipmissing::Symbol=:none) + return corspearman(x, reshape(y, (length(y), 1)); skipmissing=skipmissing) +end + +function corspearman(x::AbstractVector, y::AbstractMatrix; skipmissing::Symbol=:none) + return corspearman(reshape(x, (length(x), 1)), y; skipmissing=skipmissing) +end + +function corspearman(x::AbstractMatrix, y::AbstractMatrix=x; + skipmissing::Symbol=:none) + check_rankcor_args(x, y, skipmissing, true) + if x === y + return pairwise(corspearman, _eachcol(x); skipmissing=skipmissing) + else + return pairwise(corspearman, _eachcol(x), _eachcol(y); skipmissing=skipmissing) end - return C end -function corspearman(x::AbstractVector{<:Real}, Y::AbstractMatrix{<:Real}) - size(Y, 1) == length(x) || - throw(DimensionMismatch("x and Y have inconsistent dimensions")) - n = size(Y, 2) - C = Matrix{Float64}(I, 1, n) - any(isnan, x) && return fill!(C, NaN) - xrank = tiedrank(x) - for j = 1:n - Yj = view(Y, :, j) - if any(isnan, Yj) - C[1,j] = NaN - else - Yjrank = tiedrank(Yj) - C[1,j] = cor(xrank, Yjrank) +function corspearman(x::AbstractVector{T}) where {T} + return T === Missing ? missing : 1.0 +end + +function _pairwise!(::Val{:listwise}, f::typeof(corspearman), dest::AbstractMatrix, x, y, + symmetric::Bool) + return _pairwise!(Val(:none), f, dest, handle_listwise(x, y)..., symmetric) +end + +function _pairwise!(::Val{:none}, f::typeof(corspearman), + dest::AbstractMatrix, x, y, symmetric::Bool) + + symmetric = x === y + if symmetric && promoted_type(x) === Missing + offdiag = (length(x[1]) < 2) ? NaN : missing + @inbounds for i in axes(dest, 1), j in axes(dest, 2) + dest[i, j] = i == j ? missing : offdiag end + return dest end - return C + + ranksx = ranks_matrix(x) + if symmetric + dest .= _cor(ranksx, ranksx) + else + ranksy = ranks_matrix(y) + dest .= _cor(ranksx, ranksy) + end + + #=When elements x[i] and y[i] are identical (according to `===`) then dest[i,i] should + be 1.0 even in the presence of missing and NaN values. But the return from `_cor` does + not respect that requirement. So amend.=# + autocor = (eltype(dest) === Missing && skipmissing == :none) ? missing : 1.0 + @inbounds for i in 1:min(size(dest, 1), size(dest, 2)) + x[i] === y[i] && (dest[i, i] = autocor) + end + return dest end -function corspearman(X::AbstractMatrix{<:Real}) - n = size(X, 2) - C = Matrix{Float64}(I, n, n) - anynan = Vector{Bool}(undef, n) - for j = 1:n - Xj = view(X, :, j) - anynan[j] = any(isnan, Xj) - if anynan[j] - C[:,j] .= NaN - C[j,:] .= NaN - C[j,j] = 1 - continue +function _pairwise!(::Val{:pairwise}, f::typeof(corspearman), + dest::AbstractMatrix{V}, x, y, symmetric::Bool) where {V} + + nr, nc = size(dest) + m = length(x) == 0 ? 0 : length(first(x)) + symmetric = x === y + + # Swap x and y for more efficient threaded loop. + if nr < nc + dest2 = similar(dest, size(dest, 2), size(dest, 1)) + _pairwise!(Val(:pairwise), f, dest2, y, x, symmetric) + dest .= transpose(dest2) + return dest + end + + tempx = sortperm_matrix(x) + tempy = symmetric ? tempx : sortperm_matrix(y) + + int64 = Int64[] + fl64 = Float64[] + nmtx = promoted_nmtype(x)[] + nmty = promoted_nmtype(y)[] + Threads.@threads for subset in EqualSumSubsets(nr, Threads.nthreads()) + + for i in subset + + inds = task_local_vector(:inds, int64, m) + spnmx = task_local_vector(:spnmx, int64, m) + spnmy = task_local_vector(:spnmy, int64, m) + nmx = task_local_vector(:nmx, nmtx, m) + nmy = task_local_vector(:nmy, nmty, m) + ranksx = task_local_vector(:ranksx, fl64, m) + ranksy = task_local_vector(:ranksy, fl64, m) + + for j = 1:(symmetric ? i : nc) + # For performance, diagonal is special-cased + if x[i] === y[j] && i == j + if missing isa V && eltype(x[i]) == Missing + dest[i, j] = missing + else + dest[i, j] = 1.0 + end + else + dest[i, j] = corspearman_kernel!(x[i], y[j], :pairwise, + view(tempx, :, i), view(tempy, :, j), inds, spnmx, spnmy, nmx, + nmy, ranksx, ranksy) + end + end end - Xjrank = tiedrank(Xj) - for i = 1:(j-1) - Xi = view(X, :, i) - if anynan[i] - C[i,j] = C[j,i] = NaN + end + symmetric && LinearAlgebra.copytri!(dest, 'L') + return dest +end + +""" + corspearman_kernel!(x::AbstractVector{T}, y::AbstractVector{U}, + skipmissing::Symbol, sortpermx=sortperm(x), sortpermy=sortperm(y), + inds=zeros(Int64, length(x)), spnmx=zeros(Int64, length(x)), + spnmy=zeros(Int64, length(x)), nmx=similar(x, nonmissingtype(T)), + nmy=similar(y, nonmissingtype(U)), ranksx=similar(x, Float64), + ranksy=similar(y, Float64)) where {T,U} + +Compute Spearman's rank correlation coefficient between `x` and `y` with no allocations if +all arguments are provided. + +All vector arguments must have the same axes. +- `sortpermx`: The sort permutation of `x`. +- `sortpermy`: The sort permutation of `y`. +- `inds` ... `ranksy`: Pre-allocated "scratch" arguments. + +## Example +```julia-repl +julia> using StatsBase, BenchmarkTools, Random + +julia> Random.seed!(1); + +julia> x = ifelse.(rand(1000) .< 0.05,missing,randn(1000));y = ifelse.(rand(1000) .< 0.05, missing,randn(1000)); + +julia> sortpermx=sortperm(x);sortpermy=sortperm(y);inds=zeros(Int64,1000);spnmx=zeros(Int64,1000);spnmy=zeros(Int64,1000); + +julia> nmx=zeros(1000);nmy=zeros(1000);ranksx=similar(x,Float64);ranksy=similar(y,Float64); + +julia> @btime corspearman_kernel!(\$x,\$y,:pairwise,\$sortpermx,\$sortpermy,\$inds,\$spnmx,\$spnmy,\$nmx,\$nmy,\$ranksx,\$ranksy) +4.671 μs (0 allocations: 0 bytes) +-0.016058512110609713 +``` +""" +function corspearman_kernel!(x::AbstractVector{T}, y::AbstractVector{U}, + skipmissing::Symbol, sortpermx=sortperm(x), sortpermy=sortperm(y), + inds=zeros(Int64, length(x)), spnmx=zeros(Int64, length(x)), + spnmy=zeros(Int64, length(x)), nmx=similar(x, nonmissingtype(T)), + nmy=similar(y, nonmissingtype(U)), ranksx=similar(x, Float64), + ranksy=similar(y, Float64)) where {T,U} + + (axes(x) == axes(sortpermx) == axes(y) == axes(sortpermy) == axes(inds) == + axes(spnmx) == axes(spnmy) == axes(nmx) == axes(nmy) == axes(ranksx) == + axes(ranksy)) || throw(ArgumentError("Axes of inputs must match")) + + if skipmissing == :pairwise + + lb = first(axes(x, 1)) + k = lb + #= Process (x,y) to obtain (nmx,nmy) by filtering out elements at position k if + either x[k] or y[k] is missing. inds provides the mapping of elements of x (or y) to + elements of nmx (or nmy) i.e. x[k] maps to nmx[inds[k]]. inds is then used to obtain + spnmx and spnmy more efficiently than calling sortperm(nmx) and sortperm(nmy). + =# + @inbounds for i in axes(x, 1) + if !(ismissing(x[i]) || ismissing(y[i])) + inds[i] = k + nmx[k] = x[i] + nmy[k] = y[i] + k += 1 else - Xirank = tiedrank(Xi) - C[i,j] = C[j,i] = cor(Xjrank, Xirank) + inds[i] = lb - 1 end end + + nnm = k - 1 + if nnm <= 1 + return NaN + end + nmx = view(nmx, lb:nnm) + nmy = view(nmy, lb:nnm) + + if any(_isnan, nmx) || any(_isnan, nmy) + return NaN + end + + k = lb + @inbounds for i in axes(x, 1) + if (inds[sortpermx[i]]) != lb - 1 + spnmx[k] = inds[sortpermx[i]] + k += 1 + end + end + spnmx = view(spnmx, lb:nnm) + + k = lb + @inbounds for i in axes(y, 1) + if (inds[sortpermy[i]]) != lb - 1 + spnmy[k] = inds[sortpermy[i]] + k += 1 + end + end + spnmy = view(spnmy, lb:nnm) + + if nnm <= 1 + return NaN + end + + _tiedrank!(view(ranksx, 1:nnm), nmx, spnmx) + _tiedrank!(view(ranksy, 1:nnm), nmy, spnmy) + + return cor(view(ranksx, 1:nnm), view(ranksy, 1:nnm)) + + else + if length(x) <= 1 + return NaN + elseif skipmissing == :none && (missing isa T || missing isa U) && + (any(ismissing, x) || any(ismissing, y)) + return missing + elseif any(_isnan, x) || any(_isnan, y) + return NaN + end + + _tiedrank!(ranksx, x, sortpermx) + _tiedrank!(ranksy, y, sortpermy) + return cor(ranksx, ranksy) end - return C end -function corspearman(X::AbstractMatrix{<:Real}, Y::AbstractMatrix{<:Real}) - size(X, 1) == size(Y, 1) || - throw(ArgumentError("number of rows in each array must match")) - nr = size(X, 2) - nc = size(Y, 2) - C = Matrix{Float64}(undef, nr, nc) - for j = 1:nr - Xj = view(X, :, j) - if any(isnan, Xj) - C[j,:] .= NaN - continue +# Auxiliary functions for Spearman's rank correlation + +""" + _cor(x, y) +Work-around various "unhelpful" features of cor: +a) Ensures that on-diagonal elements of the return are as they should be in the symmetric +case i.e. 1.0 unless eltype(x) is Missing in which case on-diagonal elements should be +missing. +b) Ensure that _cor(a,b) is NaN when a and b are vectors of equal length less than 2 +c) Works around some edge-case bugs in cor's handling of `missing` where the function throws +if `x` and `y` are matrices but nevertheless looping around the columns of `x` and `y` +works. https://github.com/JuliaStats/Statistics.jl/issues/63 + +# Example +```julia-repl +julia> x = y = fill(missing,2,2) +2×2 Matrix{Missing}: + missing missing + missing missing + +julia> Statistics.cor(x,y) +ERROR: MethodError: no method matching copy(::Missing) + +julia> StatsBase._cor(x,y) +2×2 Matrix{Union{Missing, Float64}}: + 1.0 missing + missing 1.0 + +julia> + +``` +""" +function _cor(ranksx::AbstractMatrix{T}, ranksy::AbstractMatrix{U}) where {T,U} + symmetric = ranksx === ranksy + + if size(ranksx, 1) < 2 + if symmetric && T === Missing + return ifelse.(axes(ranksx, 2) .== axes(ranksx, 2)', missing, NaN) + elseif symmetric + return ifelse.(axes(ranksx, 2) .== axes(ranksx, 2)', 1.0, NaN) + else + return fill(NaN, size(ranksx, 2), size(ranksy, 2)) end - Xjrank = tiedrank(Xj) - for i = 1:nc - Yi = view(Y, :, i) - if any(isnan, Yi) - C[j,i] = NaN - else - Yirank = tiedrank(Yi) - C[j,i] = cor(Xjrank, Yirank) + end + try + if symmetric + return cor(ranksx) + else + return cor(ranksx, ranksy) + end + catch + #=This catch block is hit when e.g. + ranksx === ranksy = [missing missing;missing missing] + =# + nr, nc = size(ranksx, 2), size(ranksy, 2) + if ranksx === ranksy && T === Missing + return fill(missing, nr, nc) + elseif missing isa T || missing isa U + C = ones(Union{Missing,Float64}, nr, nc) + else + C = ones(Float64, nr, nc) + end + + for j = (symmetric ? 2 : 1):nr + for i = 1:(symmetric ? j - 1 : nc) + C[j, i] = cor(view(ranksx, :, j), view(ranksy, :, i)) + symmetric && (C[i, j] = C[j, i]) end end + return C + end +end + +""" + sortperm_matrix(x) + +Given `x`, a vector of vectors, return a matrix who's ith column is the sort permutation of +the ith element of x. +""" +function sortperm_matrix(x) + m = length(x) == 0 ? 0 : length(first(x)) + nc = length(x) + int64 = Int64[] + out = Array{Int}(undef, m, nc) + + Threads.@threads for i in 1:nc + ints = task_local_vector(:ints, int64, m) + sortperm!(ints, x[i]) + out[:, i] .= ints end - return C + return out end +""" + ranks_matrix(x) + +Given `x`, a vector of vectors, return a matrix such that the (Pearson) correlaton between +columns of the return is the Spearman rank correlation between the elements of x. +""" +function ranks_matrix(x) + + m = length(x) == 0 ? 0 : length(first(x)) + nc = length(x) + int64 = Int64[] + + if promoted_type(x) === Missing + return fill(missing, m, nc) + end + + out = Array{Union{Missing,Int,Float64}}(undef, m, nc) + + Threads.@threads for i in 1:nc + ints = task_local_vector(:ints, int64, m) + + if any(ismissing, x[i]) + out[:, i] .= missing + elseif any(_isnan, x[i]) + out[:, i] .= NaN + else + sortperm!(ints, x[i]) + _tiedrank!(view(out, :, i), x[i], ints) + end + end + return out +end ####################################### # @@ -122,17 +398,205 @@ end # ####################################### +""" + corkendall(x, y=x; skipmissing::Symbol=:none) + +Compute Kendall's rank correlation coefficient, τ. `x` and `y` must be either vectors or +matrices, and entries may be `missing`. + +Uses multiple threads when either `x` or `y` is a matrix. + +# Keyword argument + +- `skipmissing::Symbol=:none`: If `:none` (the default), `missing` entries in `x` or `y` + give rise to `missing` entries in the return. If `:pairwise` when calculating an element + of the return, both `i`th entries of the input vectors are skipped if either is missing. + If `:listwise` the `i`th rows of both `x` and `y` are skipped if `missing` appears in + either; note that this might skip a high proportion of entries. Only allowed when `x` or + `y` is a matrix. +""" +function corkendall(x::AbstractMatrix, y::AbstractMatrix=x; + skipmissing::Symbol=:none) + check_rankcor_args(x, y, skipmissing, true) + if x === y + return pairwise(corkendall, _eachcol(x); skipmissing=skipmissing) + else + return pairwise(corkendall, _eachcol(x), _eachcol(y); skipmissing=skipmissing) + end +end + +function corkendall(x::AbstractVector, y::AbstractVector; + skipmissing::Symbol=:none) + check_rankcor_args(x, y, skipmissing, false) + if x === y + return corkendall(x) + else + x = copy(x) + permx = sortperm(x) + permute!(x, permx) + return corkendall_kernel!(x, y, permx, skipmissing) + end +end + +#= corkendall returns a vector in this case, inconsistent with with Statistics.cor and +StatsBase.corspearman, but consistent with StatsBase.corkendall. + =# +function corkendall(x::AbstractMatrix, y::AbstractVector; skipmissing::Symbol=:none) + return vec(corkendall(x, reshape(y, (length(y), 1)); skipmissing=skipmissing)) +end + +function corkendall(x::AbstractVector, y::AbstractMatrix; skipmissing::Symbol=:none) + return corkendall(reshape(x, (length(x), 1)), y; skipmissing=skipmissing) +end + +function corkendall(x::AbstractVector{T}) where {T} + return T === Missing ? missing : 1.0 +end + +function _pairwise!(::Val{:none}, f::typeof(corkendall), dest::AbstractMatrix, x, y, + symmetric::Bool) + return corkendall_loop!(:none, f, dest, x, y, symmetric) +end + +function _pairwise!(::Val{:pairwise}, f::typeof(corkendall), dest::AbstractMatrix, x, y, + symmetric::Bool) + return corkendall_loop!(:pairwise, f, dest, x, y, symmetric) +end + +function _pairwise!(::Val{:listwise}, f::typeof(corkendall), dest::AbstractMatrix, x, y, + symmetric::Bool) + return corkendall_loop!(:none, f, dest, handle_listwise(x, y)..., symmetric) +end + +function corkendall_loop!(skipmissing::Symbol, f::typeof(corkendall), dest::AbstractMatrix{V}, + x, y, symmetric::Bool) where {V} + + nr, nc = size(dest) + m = length(x) == 0 ? 0 : length(first(x)) + + # Swap x and y for more efficient threaded loop. + if nr < nc + dest2 = similar(dest, size(dest, 2), size(dest, 1)) + corkendall_loop!(skipmissing, f, dest2, y, x, symmetric) + dest .= transpose(dest2) + return dest + end + + intvec = Int[] + t = promoted_type(x)[] + u = promoted_type(y)[] + t′ = promoted_nmtype(x)[] + u′ = promoted_nmtype(y)[] + + symmetric = x === y + + Threads.@threads for subset in EqualSumSubsets(nr, Threads.nthreads()) + + for i in subset + + sortedxj = task_local_vector(:sortedxj, t, m) + scratch_py = task_local_vector(:scratch_py, u, m) + yj = task_local_vector(:yj, u, m) + permx = task_local_vector(:permx, intvec, m) + # Ensuring missing is not an element type of scratch_sy, scratch_fx, scratch_fy + # gives improved performance. + scratch_sy = task_local_vector(:scratch_sy, u′, m) + scratch_fx = task_local_vector(:scratch_fx, t′, m) + scratch_fy = task_local_vector(:scratch_fy, t′, m) + + sortperm!(permx, x[i]) + @inbounds for k in eachindex(sortedxj) + sortedxj[k] = x[i][permx[k]] + end + + for j = 1:(symmetric ? i : nc) + # For performance, diagonal is special-cased + if x[i] === y[j] && j == i + if missing isa V && eltype(x[i]) == Missing + dest[i, j] = missing + else + dest[i, j] = 1.0 + end + else + yj .= y[j] + dest[i, j] = corkendall_kernel!(sortedxj, yj, permx, skipmissing; + scratch_py=scratch_py, scratch_sy=scratch_sy, scratch_fx=scratch_fx, + scratch_fy=scratch_fy) + end + end + end + end + symmetric && LinearAlgebra.copytri!(dest, 'L') + return dest +end + +# Auxiliary functions for Kendall's rank correlation + # Knight, William R. “A Computer Method for Calculating Kendall's Tau with Ungrouped Data.” # Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439. # JSTOR, www.jstor.org/stable/2282833. -function corkendall!(x::AbstractVector{<:Real}, y::AbstractVector{<:Real}, permx::AbstractArray{<:Integer}=sortperm(x)) - if any(isnan, x) || any(isnan, y) return NaN end - n = length(x) - if n != length(y) error("Vectors must have same length") end - # Initial sorting - permute!(x, permx) - permute!(y, permx) +function check_rankcor_args(x, y, skipmissing, allowlistwise::Bool) + # Base.require_one_based_indexing(x, y) #TODO find how to reject non-1-based in a way that works in Julia 1.0.5 + size(x, 1) == size(y, 1) || + throw(DimensionMismatch("x and y have inconsistent dimensions")) + if allowlistwise + skipmissing in (:none, :pairwise, :listwise) || + throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise, but got :$skipmissing")) + else + skipmissing in (:none, :pairwise) || + throw(ArgumentError("skipmissing must be either :none or :pairwise, but got :$skipmissing")) + end +end + +""" + corkendall_kernel!(sortedx::AbstractVector, y::AbstractVector, + permx::AbstractVector{<:Integer}, skipmissing::Symbol; + scratch_py::AbstractVector=similar(y), + scratch_sy::AbstractVector=similar(y), + scratch_fx::AbstractVector=similar(sortedx), + scratch_fy::AbstractVector=similar(y)) + +Kendall correlation between two vectors but omitting the initial sorting of the first +argument. Calculating Kendall correlation between `x` and `y` is thus a two stage process: +a) sort `x` to get `sortedx`; b) call this function on `sortedx` and `y`, with +subsequent arguments: +- `permx`: The permutation that sorts `x` to yield `sortedx`. +- `scratch_py`: A vector used to permute `y` without allocation. +- `scratch_sy`: A vector used to sort `y` without allocation. +- `scratch_fx, scratch_fy`: Vectors used to filter `missing`s from `x` and `y` without + allocation. +""" +function corkendall_kernel!(sortedx::AbstractVector{T}, y::AbstractVector{U}, + permx::AbstractVector{<:Integer}, skipmissing::Symbol; + scratch_py::AbstractVector{V}=similar(y), + scratch_sy::AbstractVector=similar(y), + scratch_fx::AbstractVector=similar(sortedx), + scratch_fy::AbstractVector=similar(y)) where {T,U,V} + + length(sortedx) >= 2 || return NaN + + if skipmissing == :none + if missing isa T && any(ismissing, sortedx) + return missing + elseif missing isa U && any(ismissing, y) + return missing + end + end + + @inbounds for i in eachindex(y) + scratch_py[i] = y[permx[i]] + end + + if missing isa T || missing isa V + sortedx, permutedy = handle_pairwise(sortedx, scratch_py; scratch_fx=scratch_fx, scratch_fy=scratch_fy) + else + permutedy = scratch_py + end + + (any(_isnan, sortedx) || any(_isnan, permutedy)) && return NaN + + n = length(sortedx) # Use widen to avoid overflows on both 32bit and 64bit npairs = div(widen(n) * (n - 1), 2) @@ -140,80 +604,35 @@ function corkendall!(x::AbstractVector{<:Real}, y::AbstractVector{<:Real}, permx k = 0 @inbounds for i = 2:n - if x[i - 1] == x[i] + if sortedx[i-1] == sortedx[i] k += 1 elseif k > 0 - # Sort the corresponding chunk of y, so the rows of hcat(x,y) are - # sorted first on x, then (where x values are tied) on y. Hence - # double ties can be counted by calling countties. - sort!(view(y, (i - k - 1):(i - 1))) + #= + Sort the corresponding chunk of permutedy, so rows of hcat(sortedx,permutedy) + are sorted first on sortedx, then (where sortedx values are tied) on permutedy. + Hence double ties can be counted by calling countties. + =# + sort!(view(permutedy, (i-k-1):(i-1))) ntiesx += div(widen(k) * (k + 1), 2) # Must use wide integers here - ndoubleties += countties(y, i - k - 1, i - 1) + ndoubleties += countties(permutedy, i - k - 1, i - 1) k = 0 end end if k > 0 - sort!(view(y, (n - k):n)) + sort!(view(permutedy, (n-k):n)) ntiesx += div(widen(k) * (k + 1), 2) - ndoubleties += countties(y, n - k, n) + ndoubleties += countties(permutedy, n - k, n) end - nswaps = merge_sort!(y, 1, n) - ntiesy = countties(y, 1, n) + nswaps = merge_sort!(permutedy, 1, n, scratch_sy) + ntiesy = countties(permutedy, 1, n) # Calls to float below prevent possible overflow errors when - # length(x) exceeds 77_936 (32 bit) or 5_107_605_667 (64 bit) - (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / - sqrt(float(npairs - ntiesx) * float(npairs - ntiesy)) -end - -""" - corkendall(x, y=x) - -Compute Kendall's rank correlation coefficient, τ. `x` and `y` must both be either -matrices or vectors. -""" -corkendall(x::AbstractVector{<:Real}, y::AbstractVector{<:Real}) = corkendall!(copy(x), copy(y)) - -function corkendall(X::AbstractMatrix{<:Real}, y::AbstractVector{<:Real}) - permy = sortperm(y) - return([corkendall!(copy(y), X[:,i], permy) for i in 1:size(X, 2)]) -end - -function corkendall(x::AbstractVector{<:Real}, Y::AbstractMatrix{<:Real}) - n = size(Y, 2) - permx = sortperm(x) - return(reshape([corkendall!(copy(x), Y[:,i], permx) for i in 1:n], 1, n)) -end - -function corkendall(X::AbstractMatrix{<:Real}) - n = size(X, 2) - C = Matrix{Float64}(I, n, n) - for j = 2:n - permx = sortperm(X[:,j]) - for i = 1:j - 1 - C[j,i] = corkendall!(X[:,j], X[:,i], permx) - C[i,j] = C[j,i] - end - end - return C + # length(sortedx) exceeds 77_936 (32 bit) or 5_107_605_667 (64 bit) + return (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / + sqrt(float(npairs - ntiesx) * float(npairs - ntiesy)) end -function corkendall(X::AbstractMatrix{<:Real}, Y::AbstractMatrix{<:Real}) - nr = size(X, 2) - nc = size(Y, 2) - C = Matrix{Float64}(undef, nr, nc) - for j = 1:nr - permx = sortperm(X[:,j]) - for i = 1:nc - C[j,i] = corkendall!(X[:,j], Y[:,i], permx) - end - end - return C -end - -# Auxiliary functions for Kendall's rank correlation - """ countties(x::AbstractVector{<:Real}, lo::Integer, hi::Integer) @@ -224,8 +643,8 @@ function countties(x::AbstractVector, lo::Integer, hi::Integer) # length(x) exceeds 2^16 (32 bit) or 2^32 (64 bit) thistiecount = result = widen(0) checkbounds(x, lo:hi) - @inbounds for i = (lo + 1):hi - if x[i] == x[i - 1] + @inbounds for i = (lo+1):hi + if x[i] == x[i-1] thistiecount += 1 elseif thistiecount > 0 result += div(thistiecount * (thistiecount + 1), 2) @@ -236,7 +655,7 @@ function countties(x::AbstractVector, lo::Integer, hi::Integer) if thistiecount > 0 result += div(thistiecount * (thistiecount + 1), 2) end - result + return result end # Tests appear to show that a value of 64 is optimal, @@ -246,12 +665,15 @@ const SMALL_THRESHOLD = 64 # merge_sort! copied from Julia Base # (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) """ - merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVector=similar(v, 0)) + merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, + t::AbstractVector=similar(v, 0)) Mutates `v` by sorting elements `x[lo:hi]` using the merge sort algorithm. -This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort distance. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort +distance. """ -function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVector=similar(v, 0)) +function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, + t::AbstractVector=similar(v, 0)) # Use of widen below prevents possible overflow errors when # length(v) exceeds 2^16 (32 bit) or 2^32 (64 bit) nswaps = widen(0) @@ -261,7 +683,7 @@ function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVec m = midpoint(lo, hi) (length(t) < m - lo + 1) && resize!(t, m - lo + 1) - nswaps = merge_sort!(v, lo, m, t) + nswaps = merge_sort!(v, lo, m, t) nswaps += merge_sort!(v, m + 1, hi, t) i, j = 1, lo @@ -294,25 +716,28 @@ end # insertion_sort! and midpoint copied from Julia Base # (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) -midpoint(lo::T, hi::T) where T <: Integer = lo + ((hi - lo) >>> 0x01) +midpoint(lo::T, hi::T) where {T<:Integer} = lo + ((hi - lo) >>> 0x01) midpoint(lo::Integer, hi::Integer) = midpoint(promote(lo, hi)...) """ insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) Mutates `v` by sorting elements `x[lo:hi]` using the insertion sort algorithm. -This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort distance. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort +distance. """ function insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) - if lo == hi return widen(0) end + if lo == hi + return widen(0) + end nswaps = widen(0) - @inbounds for i = lo + 1:hi + @inbounds for i = lo+1:hi j = i x = v[i] while j > lo - if x < v[j - 1] + if x < v[j-1] nswaps += 1 - v[j] = v[j - 1] + v[j] = v[j-1] j -= 1 continue end @@ -322,3 +747,15 @@ function insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) end return nswaps end + +# Auxiliary functions for both Kendall's and Spearman's rank correlations + +# _isnan required so that corkendall and corspearman have correct handling of NaNs and +# can also accept arguments with element type for which isnan is not defined but isless is +# is defined, so that rank correlation makes sense. +_isnan(x::T) where {T<:Number} = isnan(x) +_isnan(x) = false + +# eachcol was added in Julia 1.1 but for Julia 1.8, keys(eachcol(x)) fails for any Matrix x +# which causes `check_vectors` to fail. Solution is to use the comprehension below. +_eachcol(x) = VERSION < v"1.9" ? [view(x, :, i) for i in axes(x, 2)] : eachcol(x) \ No newline at end of file diff --git a/test/pairwise.jl b/test/pairwise.jl index 59e081b08..24ede4492 100644 --- a/test/pairwise.jl +++ b/test/pairwise.jl @@ -9,8 +9,14 @@ Random.seed!(1) # to avoid using specialized method arbitrary_fun(x, y) = cor(x, y) -@testset "pairwise and pairwise! with $f" for f in (arbitrary_fun, cor, cov) +@testset "pairwise and pairwise! with $f" for f in (arbitrary_fun, cor, cov, corkendall, corspearman) + + isrankcor = f in (corkendall, corspearman) + iscor = isrankcor || f == cor + throwsforzerolengthinput = f in (arbitrary_fun, cor, cov) + @testset "basic interface" begin + x = [rand(10) for _ in 1:4] y = [rand(Float32, 10) for _ in 1:5] # to test case where inference of returned eltype fails @@ -23,24 +29,33 @@ arbitrary_fun(x, y) = cor(x, y) @test res == res2 == [f(xi, yi) for xi in x, yi in y] res = pairwise(f, y, z) - @test res isa Matrix{Float32} - res2 = zeros(Float32, size(res)) - @test pairwise!(f, res2, y, z) === res2 - @test res == res2 == [f(yi, zi) for yi in y, zi in z] + if isrankcor + @test res isa Matrix{Float64} + @test res == [f(yi, zi) for yi in y, zi in z] + else + @test res isa Matrix{Float32} + res2 = zeros(Float32, size(res)) + @test pairwise!(f, res2, y, z) === res2 + @test res == res2 == [f(yi, zi) for yi in y, zi in z] + end res = pairwise(f, Any[[1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]]) @test res isa Matrix{Float64} res2 = zeros(AbstractFloat, size(res)) @test pairwise!(f, res2, Any[[1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]]) === res2 - @test res == res2 == - [f(xi, yi) for xi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]), - yi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0])] + + res3 = [f(xi, yi) for xi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]), + yi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0])] + @test res == res2 + @test all(isapprox.(res2, res3)) + @test res isa Matrix{Float64} @inferred pairwise(f, x, y) - - @test_throws Union{ArgumentError,MethodError} pairwise(f, [Int[]], [Int[]]) - @test_throws Union{ArgumentError,MethodError} pairwise!(f, zeros(1, 1), [Int[]], [Int[]]) + if throwsforzerolengthinput + @test_throws Union{CompositeException,TaskFailedException} pairwise(f, [Int[]], [Int[]]) + @test_throws Union{CompositeException,TaskFailedException} pairwise!(f, zeros(1, 1), [Int[]], [Int[]]) + end res = pairwise(f, [], []) @test size(res) == (0, 0) @@ -69,7 +84,22 @@ arbitrary_fun(x, y) = cor(x, y) @test_throws DimensionMismatch pairwise!(f, zeros(1, 2), x, y) @test_throws DimensionMismatch pairwise!(f, zeros(1, 2), [], []) @test_throws DimensionMismatch pairwise!(f, zeros(0, 0), - [], [[1, 2], [2, 3]]) + [], [[1, 2], [2, 3]]) + + @test pairwise(f, x, y) == (pairwise(f, y, x))' + @test pairwise(f, x, y, skipmissing=:pairwise) == (pairwise(f, y, x, skipmissing=:pairwise))' + @test pairwise(f, x, y, skipmissing=:listwise) == (pairwise(f, y, x, skipmissing=:listwise))' + end + + if f in (cor, corkendall, corspearman) + @testset "handling === elements" begin + xm = [missing, 1, 3, 2] + xn = [NaN, 1, 3, 2] + xmn = [missing, NaN, 1, 3, 2] + @test isequal(pairwise(f, [xm, xm], [xm, copy(xm)]), [1.0 missing; missing missing]) + @test isequal(pairwise(f, [xn, xn], [xn, copy(xn)]), [1.0 NaN; NaN NaN]) + @test isequal(pairwise(f, [xmn, xmn], [xmn, copy(xmn)]), [1.0 missing; missing missing]) + end end @testset "missing values handling interface" begin @@ -79,77 +109,94 @@ arbitrary_fun(x, y) = cor(x, y) res = pairwise(f, xm, ym) @test res isa Matrix{Missing} - res2 = zeros(Union{Float64, Missing}, size(res)) + res2 = zeros(Union{Float64,Missing}, size(res)) @test pairwise!(f, res2, xm, ym) === res2 @test res ≅ res2 ≅ [missing for xi in xm, yi in ym] res = pairwise(f, xm, ym, skipmissing=:pairwise) @test res isa Matrix{Float64} - res2 = zeros(Union{Float64, Missing}, size(res)) + res2 = zeros(Union{Float64,Missing}, size(res)) @test pairwise!(f, res2, xm, ym, skipmissing=:pairwise) === res2 @test res ≅ res2 - @test isapprox(res, [f(collect.(skipmissings(xi, yi))...) for xi in xm, yi in ym], - rtol=1e-6) + # Use myskipmissings rather than skipmissings to avoid deprecation warning + function myskipmissings(x, y) + #which = @. !(ismissing(x) || ismissing(y)) # not compatible with Julia 1.6 + which = similar(x, Bool) + for i in eachindex(x) + which[i] = !(ismissing(x[i]) || ismissing(y[i])) + end + return x[which], y[which] + end + + @test isapprox(res, [f(myskipmissings(xi, yi)...) for xi in xm, yi in ym], + rtol=1e-6) res = pairwise(f, ym, zm, skipmissing=:pairwise) - @test res isa Matrix{Float32} - res2 = zeros(Union{Float32, Missing}, size(res)) - @test pairwise!(f, res2, ym, zm, skipmissing=:pairwise) === res2 - @test res ≅ res2 - @test isapprox(res, [f(collect.(skipmissings(yi, zi))...) for yi in ym, zi in zm], - rtol=1e-6) + if isrankcor + @test res isa Matrix{Float64} + @test isequal(res, [f(myskipmissings(yi, zi)...) for yi in ym, zi in zm]) + else + @test res isa Matrix{Float32} + res2 = zeros(Union{Float32,Missing}, size(res)) + @test pairwise!(f, res2, ym, zm, skipmissing=:pairwise) === res2 + @test res ≅ res2 + @test isapprox(res, [f(myskipmissings(yi, zi)...) for yi in ym, zi in zm], + rtol=1e-6) + end nminds = mapreduce(x -> .!ismissing.(x), - (x, y) -> x .& y, - [xm; ym]) + (x, y) -> x .& y, + [xm; ym]) res = pairwise(f, xm, ym, skipmissing=:listwise) @test res isa Matrix{Float64} - res2 = zeros(Union{Float64, Missing}, size(res)) + res2 = zeros(Union{Float64,Missing}, size(res)) @test pairwise!(f, res2, xm, ym, skipmissing=:listwise) === res2 @test res ≅ res2 @test isapprox(res, [f(view(xi, nminds), view(yi, nminds)) for xi in xm, yi in ym], - rtol=1e-6) + rtol=1e-6) if VERSION >= v"1.6.0-DEV" # inference of cor fails so use an inferrable function - # to check that pairwise itself is inferrable + # to check that StatsBase.pairwise itself is inferrable for skipmissing in (:none, :pairwise, :listwise) g(x, y=x) = pairwise((x, y) -> x[1] * y[1], x, y, skipmissing=skipmissing) - @test Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}}) == - Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}, - Vector{Vector{Union{Float64, Missing}}}}) == - Matrix{<: Union{Float64, Missing}} + @test Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64,Missing}}}}) == + Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64,Missing}}}, + Vector{Vector{Union{Float64,Missing}}}}) == + Matrix{<:Union{Float64,Missing}} if skipmissing in (:pairwise, :listwise) - @test_broken Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}}) == - Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}, - Vector{Vector{Union{Float64, Missing}}}}) == - Matrix{Float64} + @test_broken Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64,Missing}}}}) == + Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64,Missing}}}, + Vector{Vector{Union{Float64,Missing}}}}) == + Matrix{Float64} end end end @test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:something) - @test_throws ArgumentError pairwise!(f, zeros(Union{Float64, Missing}, - length(xm), length(ym)), xm, ym, - skipmissing=:something) + @test_throws ArgumentError pairwise!(f, zeros(Union{Float64,Missing}, + length(xm), length(ym)), xm, ym, + skipmissing=:something) # variable with only missings xm = [fill(missing, 10), rand(10)] ym = [rand(10), rand(10)] res = pairwise(f, xm, ym) - @test res isa Matrix{Union{Float64, Missing}} - res2 = zeros(Union{Float64, Missing}, size(res)) + @test res isa Matrix{Union{Float64,Missing}} + res2 = zeros(Union{Float64,Missing}, size(res)) @test pairwise!(f, res2, xm, ym) === res2 @test res ≅ res2 ≅ [f(xi, yi) for xi in xm, yi in ym] if VERSION >= v"1.5" # Fails with UndefVarError on Julia 1.0 - @test_throws Union{ArgumentError,MethodError} pairwise(f, xm, ym, skipmissing=:pairwise) - @test_throws Union{ArgumentError,MethodError} pairwise(f, xm, ym, skipmissing=:listwise) + if throwsforzerolengthinput + @test_throws Union{CompositeException,TaskFailedException} pairwise(f, xm, ym, skipmissing=:pairwise) + @test_throws Union{CompositeException,TaskFailedException} pairwise(f, xm, ym, skipmissing=:listwise) - res = zeros(Union{Float64, Missing}, length(xm), length(ym)) - @test_throws Union{ArgumentError,MethodError} pairwise!(f, res, xm, ym, skipmissing=:pairwise) - @test_throws Union{ArgumentError,MethodError} pairwise!(f, res, xm, ym, skipmissing=:listwise) + res = zeros(Union{Float64,Missing}, length(xm), length(ym)) + @test_throws Union{CompositeException,TaskFailedException} pairwise!(f, res, xm, ym, skipmissing=:pairwise) + @test_throws Union{CompositeException,TaskFailedException} pairwise!(f, res, xm, ym, skipmissing=:listwise) + end end for sm in (:pairwise, :listwise) @@ -162,12 +209,11 @@ arbitrary_fun(x, y) = cor(x, y) @testset "iterators" begin x = (v for v in [rand(10) for _ in 1:4]) y = (v for v in [rand(10) for _ in 1:4]) - res = @inferred pairwise(f, x, y) + res2 = zeros(size(res)) @test pairwise!(f, res2, x, y) === res2 @test res == res2 == pairwise(f, collect(x), collect(y)) - res = @inferred(pairwise(f, x)) res2 = zeros(size(res)) @test pairwise!(f, res2, x) === res2 @@ -179,13 +225,13 @@ arbitrary_fun(x, y) = cor(x, y) y = (Iterators.drop(v, 1) for v in [rand(10) for _ in 1:4]) @test pairwise((x, y) -> f(collect(x), collect(y)), x, y) == - [f(collect(xi), collect(yi)) for xi in x, yi in y] + [f(collect(xi), collect(yi)) for xi in x, yi in y] @test pairwise((x, y) -> f(collect(x), collect(y)), x) == - [f(collect(xi1), collect(xi2)) for xi1 in x, xi2 in x] + [f(collect(xi1), collect(xi2)) for xi1 in x, xi2 in x] @test_throws ArgumentError pairwise((x, y) -> f(collect(x), collect(y)), x, y, - skipmissing=:pairwise) + skipmissing=:pairwise) @test_throws ArgumentError pairwise((x, y) -> f(collect(x), collect(y)), x, y, - skipmissing=:listwise) + skipmissing=:listwise) end @testset "two-argument method" begin @@ -201,8 +247,8 @@ arbitrary_fun(x, y) = cor(x, y) y = [rand(10) for _ in 1:4] @test pairwise(f, x, x, symmetric=true) == - pairwise(f, x, symmetric=true) == - Symmetric(pairwise(f, x, x), :U) + pairwise(f, x, symmetric=true) == + Symmetric(pairwise(f, x, x), :U) res = zeros(4, 4) res2 = zeros(4, 4) @@ -214,47 +260,64 @@ arbitrary_fun(x, y) = cor(x, y) @test_throws ArgumentError pairwise!(f, res, x, y, symmetric=true) end - @testset "cor corner cases" begin - # Integer inputs must give a Float64 output - res = pairwise(cor, [[1, 2, 3], [1, 5, 2]]) - @test res isa Matrix{Float64} - @test res == [cor(xi, yi) for xi in ([1, 2, 3], [1, 5, 2]), - yi in ([1, 2, 3], [1, 5, 2])] - - # NaNs are ignored for the diagonal - res = pairwise(cor, [[1, 2, NaN], [1, 5, 2]]) - @test res isa Matrix{Float64} - @test res ≅ [1.0 NaN - NaN 1.0] - - # missings are ignored for the diagonal - res = pairwise(cor, [[1, 2, 7], [1, 5, missing]]) - @test res isa Matrix{Union{Float64, Missing}} - @test res ≅ [1.0 missing - missing 1.0] - res = pairwise(cor, Vector{Union{Int, Missing}}[[missing, missing, missing], - [missing, missing, missing]]) - @test res isa Matrix{Union{Float64, Missing}} - @test res ≅ [1.0 missing - missing 1.0] - if VERSION >= v"1.5" - # except when eltype is Missing - res = pairwise(cor, [[missing, missing, missing], - [missing, missing, missing]]) - @test res isa Matrix{Missing} - @test res ≅ [missing missing - missing missing] - end + if iscor + @testset "$f corner cases" begin + # Integer inputs must give a Float64 output + res = pairwise(f, [[1, 2, 3], [1, 5, 2]]) + @test res isa Matrix{Float64} + if f == corspearman + @test isapprox(res, [f(xi, yi) for xi in ([1, 2, 3], [1, 5, 2]), + yi in ([1, 2, 3], [1, 5, 2])]) + else + @test res == [f(xi, yi) for xi in ([1, 2, 3], [1, 5, 2]), + yi in ([1, 2, 3], [1, 5, 2])] + end - for sm in (:pairwise, :listwise) - res = pairwise(cor, [[1, 2, NaN, 4], [1, 5, 5, missing]], skipmissing=sm) + # NaNs are ignored for the diagonal + res = pairwise(f, [[1, 2, NaN], [1, 5, 2]]) @test res isa Matrix{Float64} @test res ≅ [1.0 NaN - NaN 1.0] + NaN 1.0] + + # missings are ignored for the diagonal + res = pairwise(f, [[1, 2, 7], [1, 5, missing]]) + @test res isa Matrix{Union{Float64,Missing}} + @test res ≅ [1.0 missing + missing 1.0] + res = pairwise(f, Vector{Union{Int,Missing}}[[missing, missing, missing], + [missing, missing, missing]]) + @test res isa Matrix{Union{Float64,Missing}} + @test res ≅ [1.0 missing + missing 1.0] + + #NaNs and missings are ignored for the "diagonal" of a not-square result. + v = [missing, 1, 2, 3] + @test isequal(pairwise(f, [v, v], [v, v, v]), + [1.0 missing missing; missing 1.0 missing]) + w = [NaN, 1, 2, 3] + @test isequal(pairwise(f, [w, w], [w, w, w]), [1.0 NaN NaN; NaN 1.0 NaN]) + if VERSION >= v"1.5" - @test_throws ArgumentError pairwise(cor, [[missing, missing, missing], - [missing, missing, missing]], - skipmissing=sm) + # except when eltype is Missing + res = pairwise(f, [[missing, missing, missing], + [missing, missing, missing]]) + @test res isa Matrix{Missing} + @test res ≅ [missing missing + missing missing] + end + + for sm in (:pairwise, :listwise) + res = pairwise(f, [[1, 2, NaN, 4], [1, 5, 5, missing]], skipmissing=sm) + @test res isa Matrix{Float64} + @test res ≅ [1.0 NaN + NaN 1.0] + if VERSION >= v"1.5" + if f == cor + @test_throws Union{CompositeException,TaskFailedException} pairwise(f, [[missing, missing, missing], + [missing, missing, missing]], + skipmissing=sm) + end + end end end end @@ -262,30 +325,45 @@ arbitrary_fun(x, y) = cor(x, y) @testset "promote_type_union" begin @test StatsBase.promote_type_union(Int) === Int @test StatsBase.promote_type_union(Real) === Real - @test StatsBase.promote_type_union(Union{Int, Float64}) === Float64 - @test StatsBase.promote_type_union(Union{Int, Missing}) === Union{Int, Missing} - @test StatsBase.promote_type_union(Union{Int, String}) === Any + @test StatsBase.promote_type_union(Union{Int,Float64}) === Float64 + @test StatsBase.promote_type_union(Union{Int,Missing}) === Union{Int,Missing} + @test StatsBase.promote_type_union(Union{Int,String}) === Any @test StatsBase.promote_type_union(Vector) === Any @test StatsBase.promote_type_union(Union{}) === Union{} if VERSION >= v"1.6.0-DEV" - @test StatsBase.promote_type_union(Tuple{Union{Int, Float64}}) === - Tuple{Real} + @test StatsBase.promote_type_union(Tuple{Union{Int,Float64}}) === + Tuple{Real} else - @test StatsBase.promote_type_union(Tuple{Union{Int, Float64}}) === - Any + @test StatsBase.promote_type_union(Tuple{Union{Int,Float64}}) === + Any end end @testset "type-unstable corner case (#771)" begin - v = [rand(5) for _=1:10] + v = [rand(5) for _ = 1:10] function f(v) pairwise(v) do x, y (x[1] < 0 ? nothing : - x[1] > y[1] ? 1 : 1.5, - 0) + x[1] > y[1] ? 1 : 1.5, + 0) end end res = f(v) - @test res isa Matrix{Tuple{Real, Int}} + @test res isa Matrix{Tuple{Real,Int}} + end +end + +@testset "pairwise against inputs with non-numeric elements" begin + for f in (corkendall, corspearman) + x = [["a", "b", "c"], ["c", "b", "a"]] + @test pairwise(f, x) ≈ [1.0 -1.0; -1.0 1.0] end +end + +@testset "pairwise with vectors of unequal length" begin + f(x, y) = length(x) - length(y) + x = [randn(i) for i in 0:3] + @test isequal(pairwise(f, x), (0:3) .- (0:3)') + @test_throws ArgumentError pairwise(f, x, skipmissing=:pairwise) + @test_throws ArgumentError pairwise(f, x, skipmissing=:listwise) end \ No newline at end of file diff --git a/test/rankcorr.jl b/test/rankcorr.jl index dc0207ee1..a08070408 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -1,161 +1,398 @@ -using StatsBase -using Test - -X = Float64[1 0; 2 1; 3 0; 4 1; 5 10] -Y = Float64[5 5 6; 3 4 1; 4 0 4; 2 6 1; 5 7 10] - -x1 = X[:,1] -x2 = X[:,2] -y = Y[:,1] - -# corspearman - -@test corspearman(x1, y) ≈ -0.102597835208515 -@test corspearman(x2, y) ≈ -0.081110710565381 - -@test corspearman(X, y) ≈ [-0.102597835208515, -0.081110710565381] -@test corspearman(y, X) ≈ [-0.102597835208515 -0.081110710565381] - -c11 = corspearman(x1, x1) -c12 = corspearman(x1, x2) -c22 = corspearman(x2, x2) -@test c11 ≈ 1.0 -@test c22 ≈ 1.0 -@test corspearman(X, X) ≈ [c11 c12; c12 c22] -@test corspearman(X) ≈ [c11 c12; c12 c22] - -@test corspearman(X, Y) == - [corspearman(X[:,i], Y[:,j]) for i in axes(X, 2), j in axes(Y, 2)] - -# corkendall - -# Check error, handling of NaN, Inf etc -@test_throws ErrorException("Vectors must have same length") corkendall([1,2,3,4], [1,2,3]) -@test isnan(corkendall([1,2], [3,NaN])) -@test isnan(corkendall([1,1,1], [1,2,3])) -@test corkendall([-Inf,-0.0,Inf],[1,2,3]) == 1.0 - -# Test, with exact equality, some known results. -# AbstractVector{<:Real}, AbstractVector{<:Real} -@test corkendall(x1, y) == -1/sqrt(90) -@test corkendall(x2, y) == -1/sqrt(72) -# AbstractMatrix{<:Real}, AbstractVector{<:Real} -@test corkendall(X, y) == [-1/sqrt(90), -1/sqrt(72)] -# AbstractVector{<:Real}, AbstractMatrix{<:Real} -@test corkendall(y, X) == [-1/sqrt(90) -1/sqrt(72)] - -# n = 78_000 tests for overflow errors on 32 bit -# Testing for overflow errors on 64bit would require n be too large for practicality -# This also tests merge_sort! since n is (much) greater than SMALL_THRESHOLD. -n = 78_000 -# Test with many repeats -@test corkendall(repeat(x1, n), repeat(y, n)) ≈ -1/sqrt(90) -@test corkendall(repeat(x2, n), repeat(y, n)) ≈ -1/sqrt(72) -@test corkendall(repeat(X, n), repeat(y, n)) ≈ [-1/sqrt(90), -1/sqrt(72)] -@test corkendall(repeat(y, n), repeat(X, n)) ≈ [-1/sqrt(90) -1/sqrt(72)] -@test corkendall(repeat([0,1,1,0], n), repeat([1,0,1,0], n)) == 0.0 - -# Test with no repeats, note testing for exact equality -@test corkendall(collect(1:n), collect(1:n)) == 1.0 -@test corkendall(collect(1:n), reverse(collect(1:n))) == -1.0 - -# All elements identical should yield NaN -@test isnan(corkendall(repeat([1], n), collect(1:n))) - -c11 = corkendall(x1, x1) -c12 = corkendall(x1, x2) -c22 = corkendall(x2, x2) - -# AbstractMatrix{<:Real}, AbstractMatrix{<:Real} -@test corkendall(X, X) ≈ [c11 c12; c12 c22] -# AbstractMatrix{<:Real} -@test corkendall(X) ≈ [c11 c12; c12 c22] - -@test c11 == 1.0 -@test c22 == 1.0 -@test c12 == 3/sqrt(20) - -# Finished testing for overflow, so redefine n for speedier tests -n = 100 - -@test corkendall(repeat(X, n), repeat(X, n)) ≈ [c11 c12; c12 c22] -@test corkendall(repeat(X, n)) ≈ [c11 c12; c12 c22] - -# All eight three-element permutations -z = [1 1 1; - 1 1 2; - 1 2 2; - 1 2 2; - 1 2 1; - 2 1 2; - 1 1 2; - 2 2 2] - -@test corkendall(z) == [1 0 1/3; 0 1 0; 1/3 0 1] -@test corkendall(z, z) == [1 0 1/3; 0 1 0; 1/3 0 1] -@test corkendall(z[:,1], z) == [1 0 1/3] -@test corkendall(z, z[:,1]) == [1; 0; 1/3] - -z = float(z) -@test corkendall(z) == [1 0 1/3; 0 1 0; 1/3 0 1] -@test corkendall(z, z) == [1 0 1/3; 0 1 0; 1/3 0 1] -@test corkendall(z[:,1], z) == [1 0 1/3] -@test corkendall(z, z[:,1]) == [1; 0; 1/3] - -w = repeat(z, n) -@test corkendall(w) == [1 0 1/3; 0 1 0; 1/3 0 1] -@test corkendall(w, w) == [1 0 1/3; 0 1 0; 1/3 0 1] -@test corkendall(w[:,1], w) == [1 0 1/3] -@test corkendall(w, w[:,1]) == [1; 0; 1/3] - -StatsBase.midpoint(1,10) == 5 -StatsBase.midpoint(1,widen(10)) == 5 - - -# NaN handling - -Xnan = copy(X) -Xnan[1,1] = NaN -Ynan = copy(Y) -Ynan[2,1] = NaN - -for f in (corspearman, corkendall) - @test isnan(f([1.0, NaN, 2.0], [2.0, 1.0, 3.4])) - @test all(isnan, f([1.0, NaN], [1 2; 3 4])) - @test all(isnan, f([1 2; 3 4], [1.0, NaN])) - @test isequal(f([1 NaN; NaN 4]), [1 NaN; NaN 1]) - @test all(isnan, f([1 NaN; NaN 4], [1 NaN; NaN 4])) - @test all(isnan, f([1 NaN; NaN 4], [NaN 1; NaN 4])) - - @test isequal(f(Xnan, Ynan), - [f(Xnan[:,i], Ynan[:,j]) for i in axes(Xnan, 2), j in axes(Ynan, 2)]) - @test isequal(f(Xnan), - [i == j ? 1.0 : f(Xnan[:,i], Xnan[:,j]) - for i in axes(Xnan, 2), j in axes(Xnan, 2)]) - for k in 1:2 - @test isequal(f(Xnan[:,k], Ynan), - [f(Xnan[:,k], Ynan[:,j]) for i in 1:1, j in axes(Ynan, 2)]) - # TODO: fix corkendall (PR#659) - if f === corspearman - @test isequal(f(Xnan, Ynan[:,k]), - [f(Xnan[:,i], Ynan[:,k]) for i in axes(Xnan, 2), j in 1:1]) - else - @test isequal(f(Xnan, Ynan[:,k]), - [f(Xnan[:,i], Ynan[:,k]) for i in axes(Xnan, 2)]) - end - end +using StatsBase, Random, Test + +@testset "rank_correlation_auxiliary_fns" begin + + #Auxiliary functions for corkendall + n = 100 + x = [1, 2, 3, missing, 4] + y = [missing, 1, 2, 3, 4] + u = [missing, missing, 1, 2] + v = [3, 4, missing, missing] + + mx = [1 2 + missing 3 + 4 missing + missing missing + 5 6] + + @test StatsBase.handle_pairwise(x, y) == ([2, 3, 4], [1, 2, 4]) + @test StatsBase.handle_pairwise(float.(x), y) == ([2.0, 3.0, 4.0], [1, 2, 4]) + @test StatsBase.handle_pairwise(x, float.(y)) == ([2, 3, 4], [1.0, 2.0, 4.0]) + @test StatsBase.handle_pairwise(u, v) == (Int64[], Int64[]) + + ranksx = [missing missing] + ranksy = [1 2] + @test isequal(StatsBase._cor(ranksx, ranksy), [NaN NaN; NaN NaN]) + @test isequal(StatsBase._cor(ranksx, ranksx), [missing NaN; NaN missing]) + @test isequal(StatsBase._cor(ranksy, ranksy), [1.0 NaN; NaN 1.0]) + + ranksx = [missing missing; missing missing] + @test isequal(StatsBase._cor(ranksx, ranksx), [missing missing; missing missing]) + + Random.seed!(1) + ranksx = rand(10,5) ;ranksy = rand(10,6) + @test StatsBase._cor(ranksx,ranksx) == cor(ranksx) + @test StatsBase._cor(ranksx,ranksy) == cor(ranksx,ranksy) + + ranksx = fill(missing,3,3);ranksy = hcat(ranksx,[1,2,3]) + @test isequal(StatsBase._cor(ranksx,ranksx),fill(missing,3,3)) + @test isequal(StatsBase._cor(ranksx,ranksy),fill(missing,3,4)) + + v = collect(100:-1:1) + StatsBase.insertion_sort!(v, 1, n) + @test v == 1:n + + v = collect(1000:-1:1) + StatsBase.merge_sort!(v, 1, 1000) + @test v == 1:1000 + + @test StatsBase.midpoint(1, 10) == 5 + @test StatsBase.midpoint(1, widen(10)) == 5 + + for n in vcat(1:5, 10:20:90, 1000), nss in [1, 4, 8, 20, 32, 64] + #check is a partition + @test sort(vcat([collect(s) for s in StatsBase.EqualSumSubsets(n, nss)]...)) == 1:n + #check near-equal lengths + lengths = [length(s) for s in StatsBase.EqualSumSubsets(n, nss)] + @test (maximum(lengths) - minimum(lengths)) <= 1 + #check near-equal sums + sums = [sum(collect(s)) for s in StatsBase.EqualSumSubsets(n, nss)] + @test (maximum(sums) - minimum(sums)) < nss + end + end +#= +Edge cases (to refer to when writing tests) +=========================================== +In the symmetric case (x===y) on-diagonal terms should be 1.0 apart from the special case of +eltype(x) === Missing && skipmissing == :none, in which case the on-diagonal terms should be +missing. +Otherwise, if x and y are equal-length vectors with length(x)<2 +f(x,y) == NaN +otherwise if x and y are equal-length vectors and either contains missing +f(x,y) == missing +otherwise if x and y are equal-length vectors and either contains NaN +f(x,y) == NaN +otherwise +f(x,y) = the required Kendall or Spearman correlation. + +This behaviour is close to but not quite identical to Statistics.cor. e.g.: + +julia> cor(fill(missing,3,2)) +2×2 Matrix{Missing}: + missing missing + missing missing + +julia> corkendall(fill(missing,3,2)) #SAME behavior as cor +2×2 Matrix{Missing}: + missing missing + missing missing + +julia> cor(Matrix{Union{Missing,Float64}}(missing,5,3)) +3×3 Matrix{Missing}: + missing missing missing + missing missing missing + missing missing missing + +julia> corkendall(Matrix{Union{Missing,Float64}}(missing,5,3)) #DIFFERENT behaviour from cor +3×3 Matrix{Union{Missing, Float64}}: + 1.0 missing missing + missing 1.0 missing + missing missing 1.0 +=# + +@testset "$f" for f in (corkendall, corspearman) + + n = 100 + big_n = 78_000 + X = Float64[1 0; 2 1; 3 0; 4 1; 5 10] + Y = Float64[5 5 6; 3 4 1; 4 0 4; 2 6 1; 5 7 10] + Xm = [1 0; missing 1; 2 1; 3 0; 4 1; 5 10] + Ym = [5 5 6; 1 2 3; 3 4 1; 4 0 4; 2 6 1; 5 7 10] + xm = [missing, missing, missing, missing, missing] + xmm = hcat(xm, xm) + a = [5, 2, 3, 4, 1] + b = [1, 4, 2, 3, 5] + + x1 = X[:, 1] + x2 = X[:, 2] + y1 = Y[:, 1] + + c11 = f(x1, x1) + c12 = f(x1, x2) + c22 = f(x2, x2) + + # Test some known results + if f == corkendall + # Vector, Vector + @test f(x1, y1) == -1 / sqrt(90) + @test f(x2, y1) == -1 / sqrt(72) + # Matrix, Vector + @test f(X, y1) == [-1 / sqrt(90), -1 / sqrt(72)] + # Vector, Matrix + @test f(y1, X) == [-1 / sqrt(90) -1 / sqrt(72)] + + # big_n = 78_000 tests for overflow errors on 32 bit + # Testing for overflow errors on 64 bit is not practical + # This also tests merge_sort! since big_n is (much) greater than SMALL_THRESHOLD. + # Test with many repeats + @test f(repeat(x1, big_n), repeat(y1, big_n)) ≈ -1 / sqrt(90) + @test f(repeat(x2, big_n), repeat(y1, big_n)) ≈ -1 / sqrt(72) + @test f(repeat(X, big_n), repeat(y1, big_n)) ≈ [-1 / sqrt(90), -1 / sqrt(72)] + @test f(repeat(y1, big_n), repeat(X, big_n)) ≈ [-1 / sqrt(90) -1 / sqrt(72)] + elseif f == corspearman + @test corspearman(x1, y1) ≈ -1 / sqrt(95) + @test corspearman(repeat(x1, 1000), repeat(y1, 1000)) ≈ -1 / sqrt(95) + @test corspearman(vcat(missing, x1, missing), vcat(missing, y1, missing), skipmissing=:pairwise) ≈ -1 / sqrt(95) + @test corspearman(x2, y1) ≈ -3 / sqrt(1368) + @test corspearman(X, y1) ≈ [-1 / sqrt(95), -3 / sqrt(1368)] + @test corspearman(y1, X) ≈ [-1 / sqrt(95) -3 / sqrt(1368)] + end + + # Matrix, Matrix + @test f(X, X) ≈ [c11 c12; c12 c22] + # Matrix + @test f(X) ≈ [c11 c12; c12 c22] + + @test c11 == 1.0 + @test c22 == 1.0 + if f == corkendall + @test c12 == 3 / sqrt(20) + elseif f == corspearman + @test c12 == 7 / sqrt(90) + end + @test f(repeat(X, n), repeat(X, n)) ≈ [c11 c12; c12 c22] + @test f(repeat(X, n)) ≈ [c11 c12; c12 c22] + @test f(X, Y) ≈ + [f(X[:, i], Y[:, j]) for i in axes(X, 2), j in axes(Y, 2)] + + @test f(vcat(missing, a), vcat(missing, b), skipmissing=:pairwise) == f(a, b) + @test f(vcat(a, missing), vcat(missing, b), skipmissing=:pairwise) == + f(a[2:end], b[1:(end-1)]) + @test f(hcat(vcat(a, missing), vcat(missing, b)), skipmissing=:listwise) == + f(hcat(a[2:end], b[1:(end-1)])) + @test f(Xm, Xm, skipmissing=:pairwise) == f(X, X) + @test f(Xm, Xm, skipmissing=:listwise) == f(X, X) + @test f(Xm, Ym, skipmissing=:listwise) == f(X, Y) + if f == corkendall + @test f(Xm, Ym, skipmissing=:pairwise) ≈ [-1/√90 0.4 1/√90; -2/√154 7/√165 -1/√154] + end + @test isnan(f([1, 2, 3, 4, 5], xm, skipmissing=:pairwise)) + @test isnan(f(xm, [1, 2, 3, 4, 5], skipmissing=:pairwise)) + @test isequal(f(xmm, skipmissing=:pairwise), [1.0 NaN; NaN 1.0]) + @test isequal(f(xmm, skipmissing=:none), [missing missing; missing missing]) + @test isequal(f(xmm, xmm, skipmissing=:none), [missing missing; missing missing]) + @test isequal(f(xmm, copy(xmm), skipmissing=:none), + [missing missing; missing missing]) + @test isequal(f(xmm, xmm, skipmissing=:listwise), [1.0 NaN; NaN 1.0]) + @test isequal(f(xmm, copy(xmm), skipmissing=:listwise), [NaN NaN; NaN NaN]) + @test isequal(f(xmm, copy(xmm), skipmissing=:pairwise), [NaN NaN; NaN NaN]) + @test ismissing(f([1, 2, 3, 4, 5], xm, skipmissing=:none)) + @test ismissing(f([1, 2, 3, 4, 5], xm, skipmissing=:none)) + @test isequal(f(xmm, skipmissing=:none), [missing missing; missing missing]) + @test isequal(f(xmm, copy(xmm), skipmissing=:none), + [missing missing; missing missing]) + @test isequal(f(hcat(Y, xm), skipmissing=:none), vcat(hcat(f(Y, skipmissing=:none), + [missing, missing, missing]), [missing missing missing 1.0])) + @test_throws ArgumentError f([1, 2, 3, 4], [4, 3, 2, 1], skipmissing=:listwise) + + # All elements identical should yield NaN + @test isnan(f(repeat([1], n), collect(1:n))) + @test f(repeat([0, 1, 1, 0], n), repeat([1, 0, 1, 0], n)) == 0.0 + + # Test with no repeats + @test f(collect(1:n), collect(1:n)) == 1.0 + @test f(collect(1:n), reverse(collect(1:n))) == -1.0 + + # Inf and -Inf work ok + @test f([-Inf, -0.0, Inf], [1, 2, 3]) == 1.0 + + #Interaction of NaN and missing with skipmissing argument + nan_and_missing = hcat(fill(NaN, 10), fill(missing, 10), + vcat(fill(NaN, 5), fill(missing, 5))) + @test isequal(f(nan_and_missing, skipmissing=:none), + [1.0 missing missing; missing 1.0 missing; missing missing 1.0]) + @test isequal(f(nan_and_missing, copy(nan_and_missing), skipmissing=:none), + [NaN missing missing; missing missing missing; missing missing missing]) + @test isequal(f(nan_and_missing, skipmissing=:pairwise), + [1.0 NaN NaN; NaN 1.0 NaN; NaN NaN 1.0]) + @test isequal(f(nan_and_missing, copy(nan_and_missing), skipmissing=:pairwise), + fill(NaN, 3, 3)) + @test isequal(f(nan_and_missing, skipmissing=:listwise), + [1.0 NaN NaN; NaN 1.0 NaN; NaN NaN 1.0]) + @test isequal(f(nan_and_missing, copy(nan_and_missing), skipmissing=:listwise), + fill(NaN, 3, 3)) + + #Reject nonsense skipmissing argument + @test_throws ArgumentError f(X; skipmissing=:foo) + @test_throws ArgumentError f(Xm; skipmissing=:foo) + + # Inputs have fewer than 2 rows + @test isequal(f([], []), NaN) + @test isequal(f(fill(1, 0, 2), fill(1, 0, 2)), [NaN NaN; NaN NaN]) + @test isequal(f(fill(1, 0, 2)), [1.0 NaN; NaN 1.0]) + @test isequal(f(reshape([1], (1, 1)), reshape([1], (1, 1))), reshape([NaN], (1, 1))) + @test isequal(f(reshape([1], (1, 1))), reshape([1.0], (1, 1))) + @test isequal(f([missing], [missing]), NaN) + @test isequal(f([1], [1]), NaN) + @test isequal(f([NaN], [NaN]), NaN) + @test isequal(f([1], [1]), NaN) + @test isequal(f([]), 1.0) + @test isequal(f([1]), 1.0) + @test isequal(f([NaN]), 1.0) + @test isequal(f([missing]), missing) + @test isequal(f([missing]), missing) + @test isequal(f([missing], [missing missing]), [NaN NaN]) + @test isequal(f([missing missing]), [missing NaN; NaN missing]) + @test isequal(f([missing missing], [missing missing]), [NaN NaN; NaN NaN]) + + # Exercise catch block in method _cor (when f === corspearman) + @test isequal(f(vcat([1 2 3], fill(missing, 2, 3))), + [1.0 missing missing; missing 1.0 missing; missing missing 1.0]) + # Exercise "correction" of on-diagonal terms in method + # _pairwise!(::Val{:none}, f::typeof(corspearman),... + @test isequal(f([missing missing; 1 1; 1 1]), [1.0 missing; missing 1.0]) + + # Works for not-numbers + @test isequal(f(["a", "b", "c"], ["a", "b", "c"]), 1.0) + @test isequal(f(["a", "b", "c"], ["c", "b", "a"]), -1.0) + @test (f(["a" "z"; "b" "y"; "c" "x"]) ≈ [1.0 -1.0; -1.0 1.0]) + @test (f(["a" 3; "b" 2; "c" 1]) ≈ [1.0 -1.0; -1.0 1.0]) + + #Works for zero size input ( [;;] not compatible with Julia 1.0.5) + let nada = Array{Any,2}(undef, 0, 0) + @test isequal(f(nada), nada) + @test isequal(f(nada, nada), nada) + @test isequal(f(nada, nada, skipmissing=:pairwise), nada) + @test isequal(f(nada, nada, skipmissing=:listwise), nada) + end + + # Wrong dimensions + @test_throws DimensionMismatch f([1], [1, 2]) + @test_throws DimensionMismatch f([1], [1 2; 3 4]) + @test_throws DimensionMismatch f([1 2; 3 4], [1]) + @test_throws DimensionMismatch f([1 2; 3 4; 4 6], [1 2; 3 4]) + + # All eight three-element permutations, for these cases corspearman and corkendall + # have the same return values + z = [1 1 1; + 1 1 2; + 1 2 2; + 1 2 2; + 1 2 1; + 2 1 2; + 1 1 2; + 2 2 2] + + @test f(z) ≈ [1 0 1/3; 0 1 0; 1/3 0 1] + @test f(z, z) ≈ [1 0 1/3; 0 1 0; 1/3 0 1] + @test f(z[:, 1], z) ≈ [1 0 1 / 3] + @test f(z, z[:, 1]) ≈ [1; 0; 1 / 3] + + z = float(z) + @test f(z) ≈ [1 0 1/3; 0 1 0; 1/3 0 1] + @test f(z, z) ≈ [1 0 1/3; 0 1 0; 1/3 0 1] + @test f(z[:, 1], z) ≈ [1 0 1 / 3] + @test f(z, z[:, 1]) ≈ [1; 0; 1 / 3] + + w = repeat(z, n) + @test f(w) ≈ [1 0 1/3; 0 1 0; 1/3 0 1] + @test f(w, w) ≈ [1 0 1/3; 0 1 0; 1/3 0 1] + @test f(w[:, 1], w) ≈ [1 0 1 / 3] + @test f(w, w[:, 1]) ≈ [1; 0; 1 / 3] + + # NaN handling + Xnan = copy(X) + Xnan[1, 1] = NaN + Ynan = copy(Y) + Ynan[2, 1] = NaN + + @test isnan(f([1.0, NaN, 2.0], [2.0, 1.0, 3.4])) + @test all(isnan, f([1.0, NaN], [1 2; 3 4])) + @test all(isnan, f([1 2; 3 4], [1.0, NaN])) + @test isequal(f([1 NaN; NaN 4]), [1 NaN; NaN 1]) + @test all(isnan, f([1 NaN; NaN 4], [1 NaN; NaN 4])) + @test all(isnan, f([1 NaN; NaN 4], [NaN 1; NaN 4])) + + @test isequal(f(Xnan, Ynan), + [f(Xnan[:, i], Ynan[:, j]) for i in axes(Xnan, 2), j in axes(Ynan, 2)]) + @test isequal(f(Xnan), + [i == j ? 1.0 : f(Xnan[:, i], Xnan[:, j]) + for i in axes(Xnan, 2), j in axes(Xnan, 2)]) + for k in 1:2 + @test isequal(f(Xnan[:, k], Ynan), + [f(Xnan[:, k], Ynan[:, j]) for i in 1:1, j in axes(Ynan, 2)]) + # TODO: fix corkendall (PR#659) + if f === corspearman + @test isequal(f(Xnan, Ynan[:, k]), + [f(Xnan[:, i], Ynan[:, k]) for i in axes(Xnan, 2), j in 1:1]) + else + @test isequal(f(Xnan, Ynan[:, k]), + [f(Xnan[:, i], Ynan[:, k]) for i in axes(Xnan, 2)]) + end + end -# Wrong dimensions +end -@test_throws DimensionMismatch corspearman([1], [1, 2]) -@test_throws DimensionMismatch corspearman([1], [1 2; 3 4]) -@test_throws DimensionMismatch corspearman([1 2; 3 4], [1]) -@test_throws ArgumentError corspearman([1 2; 3 4; 4 6], [1 2; 3 4]) +# Don't artificially boost coverage stats when checking for mutation and allocation size +# COV_EXCL_START +@testset "Check no mutation in $f" for f in (corkendall, corspearman) + + nr = 50 + nc = 5 + cutoff = min(0.1, 10 / (nr * nc)) + X = randn(nr, nc) + x = randn(nr) + Xm = ifelse.(rand(nr, nc) .< cutoff, missing, randn(nr, nc)) + xm = ifelse.(rand(nr) .< cutoff, missing, randn(nr)) + Y = randn(nr, nc) + y = randn(nr) + Ym = ifelse.(rand(nr, nc) .< cutoff, missing, randn(nr, nc)) + ym = ifelse.(rand(nr) .< cutoff, missing, randn(nr)) + + for arg1 in (X, x, Xm, xm) + for arg2 in (Y, y, Ym, ym) + for skipmissing in (:pairwise, :none, :listwise) + for f in (corkendall, corspearman) + if !((arg1 isa Vector) && (arg2 isa Vector) && skipmissing == :listwise) + copy1 = copy(arg1) + copy2 = copy(arg2) + res = f(arg1, arg2; skipmissing=skipmissing) + @test isequal(arg1, copy1) + @test isequal(arg2, copy2) + end + end + end + end + end +end -# TODO: fix corkendall to match corspearman (PR#659) -@test_throws ErrorException corkendall([1], [1, 2]) -@test_throws ErrorException corkendall([1], [1 2; 3 4]) -@test_throws ErrorException corkendall([1 2; 3 4], [1]) -@test_throws ErrorException corkendall([1 2; 3 4; 4 6], [1 2; 3 4]) +@testset "corkendall and corspearman allocations" begin + + Random.seed!(1) + x = rand(1000, 10) + xm = ifelse.(x .< 0.1, missing, x) + #compile + corkendall(x) + corkendall(xm, skipmissing=:listwise) + corkendall(xm, skipmissing=:pairwise) + corspearman(x) + corspearman(xm, skipmissing=:listwise) + corspearman(xm, skipmissing=:pairwise) + x = rand(1000, 100) + xm = ifelse.(x .< 0.01, missing, x) + + #=When executing code such as corkendall(x) allocations are approximately affine in the + number of threads, thanks to use of task-local storage. The tests below have a "safety + factor" of 1.2 against the expected size of allocations. + =# + @test (@allocated corkendall(x)) < (896_144 + Threads.nthreads() * 57_976) * 1.2 + @test (@allocated corkendall(xm, skipmissing=:listwise)) < (1_117_728 + Threads.nthreads() * 22_104) * 1.2 + @test (@allocated corkendall(xm, skipmissing=:pairwise)) < (890_448 + Threads.nthreads() * 61_048) * 1.2 + @test (@allocated corspearman(x)) < (2_678_448 + Threads.nthreads() * 9_128) * 1.2 + @test (@allocated corspearman(xm, skipmissing=:listwise)) < (1_803_712 + Threads.nthreads() * 3_992) * 1.2 + @test (@allocated corspearman(xm, skipmissing=:pairwise)) < (1_690_544 + Threads.nthreads() * 67_104) * 1.2 + +end +# COV_EXCL_STOP \ No newline at end of file