-
Notifications
You must be signed in to change notification settings - Fork 120
/
Copy pathcounts.jl
38 lines (34 loc) · 1.66 KB
/
counts.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# Internal helper: validates two assignment vectors and forwards them to
# StatsBase.counts(a::Vector, b::Vector, (1:maxA, 1:maxB)).
# Throws DimensionMismatch when the vectors differ in length and
# ArgumentError when any index is not positive.
function _counts(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer})
    n = length(a)
    if n != length(b)
        throw(DimensionMismatch(
            "Assignment vectors have different lengths ($n and $(length(b)))"))
    end
    # NOTE: StatsBase.counts() throws ArgumentError for empty vectors,
    # so empty input is answered directly with an empty matrix
    if iszero(n)
        return Matrix{Int}(undef, 0, 0)
    end
    lo_a, hi_a = extrema(a)
    lo_b, hi_b = extrema(b)
    if lo_a <= 0 || lo_b <= 0
        throw(ArgumentError("Cluster indices should be positive integers"))
    end
    # the smallest indices are used for validation only: both level ranges
    # start from 1, so that a cluster index is also its position in the
    # counts matrix
    levels = (1:hi_a, 1:hi_b)
    return counts(a, b, levels)
end
"""
    counts(a::ClusteringResult, b::ClusteringResult) -> Matrix{Int}
    counts(a::ClusteringResult, b::AbstractVector{<:Integer}) -> Matrix{Int}
    counts(a::AbstractVector{<:Integer}, b::ClusteringResult) -> Matrix{Int}

Calculate the *cross tabulation* (aka *contingency matrix*) for the two
clusterings of the same data points.

Returns the ``n_a × n_b`` matrix `C`, where ``n_a`` and ``n_b`` are the
numbers of clusters in `a` and `b`, respectively, and `C[i, j]` is
the size of the intersection of the `i`-th cluster from `a` and the `j`-th
cluster from `b`.

The clusterings could be specified either as [`ClusteringResult`](@ref)
instances or as vectors of data point assignments.
"""
function counts(a::ClusteringResult, b::ClusteringResult)
    # both inputs are clustering results: take the assignments of each
    # and cross-tabulate them
    assign_a = assignments(a)
    assign_b = assignments(b)
    return _counts(assign_a, assign_b)
end
function counts(a::AbstractVector{<:Integer}, b::ClusteringResult)
    # `a` is already a vector of assignments; only `b` has to be unwrapped
    assign_b = assignments(b)
    return _counts(a, assign_b)
end
function counts(a::ClusteringResult, b::AbstractVector{<:Integer})
    # `b` is already a vector of assignments; only `a` has to be unwrapped
    assign_a = assignments(a)
    return _counts(assign_a, b)
end