diff --git a/doc/source/hclust.md b/doc/source/hclust.md
new file mode 100644
index 00000000..94fc8045
--- /dev/null
+++ b/doc/source/hclust.md
@@ -0,0 +1,56 @@
+# HClust
+
+Hierarchical Clustering for Julia, similar to R's `hclust()`
+
+Status
+======
+
+The package is currently work-in-progress. Clustering involves a lot of bookkeeping, and it is easy to make an error. I've validated the results against R for medium-sized problems (250--5000 elements), for the following methods:
+
+| method      | validated at matrix size | time | validated |
+|-------------|--------------------------|------|-----------|
+| `:single`   | 5000                     | 1.3  | OK        |
+| `:complete` | 2500                     | 4.5  | OK        |
+| `:average`  | 2500                     | 4.5  | OK        |
+
+Usage
+=====
+
+```julia
+d = rand(1000,1000)
+d += d'                  ## make the distance matrix symmetric; `hclust(d, method)` requires this
+h = hclust(d, :single)
+```
+
+hclust()
+--------
+
+```julia
+hclust(distance::Matrix, method::Symbol)
+```
+
+Performs hierarchical clustering for the distance matrix `d`, which must be symmetric (alternatively, a third `uplo` argument selects one triangle of a possibly non-symmetric matrix), using one of three methods:
+ - `:single`: the distance between two clusters is the minimum distance between any of their members
+ - `:average`: the distance between two clusters is the mean distance over all pairs of their members
+ - `:complete`: the distance between two clusters is the maximum distance between any of their members.
+
+The output of `hclust()` is an object of type `Hclust` with the fields
+
+ - `merge`: the clusters merged, in order, one merge per row. Leaves are indicated by negative numbers, earlier merges by their positive row index.
+ - `height`: the distance at which each merge takes place
+ - `order`: a preferred ordering for drawing a dendrogram. Not implemented, always `[1:n]`.
+ - `labels`: labels of the clusters. Not implemented, always `[1:n]`.
+ - `method`: the name of the clustering method.
+
+cutree()
+--------
+
+```julia
+cutree(cl::Hclust; h, k)
+```
+
+Cuts the cluster tree at height `h`, or such that `k` clusters are produced.
+
+The output is a vector of cluster indices; the `n`th element indicates the cluster that data point `n` belongs to.
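+
+For example, a minimal end-to-end sketch (the merge indices shown are illustrative; actual values depend on the random matrix):
+
+```julia
+d = rand(100, 100)
+d += d'                  ## symmetrize
+h = hclust(d, :average)
+h.merge[1,:]             ## e.g. [-12 -43]: the first merge joins two leaves
+h.height[1]              ## the distance at which that merge happened
+cl = cutree(h, k=10)     ## 100-element vector with cluster indices in 1:10
+```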
diff --git a/src/Clustering.jl b/src/Clustering.jl
index 9085d023..e036fd9d 100644
--- a/src/Clustering.jl
+++ b/src/Clustering.jl
@@ -42,7 +42,10 @@ module Clustering
     silhouettes,
 
     # varinfo
-    varinfo
+    varinfo,
+
+    # hclust
+    Hclust, hclust, cutree
 
     ## source files
@@ -58,5 +61,7 @@ module Clustering
     include("silhouette.jl")
     include("varinfo.jl")
 
+    include("hclust.jl")
+
     include("deprecate.jl")
 end
diff --git a/src/hclust.jl b/src/hclust.jl
new file mode 100644
index 00000000..c5f037d3
--- /dev/null
+++ b/src/hclust.jl
@@ -0,0 +1,388 @@
+## hclust.jl (c) 2014 David A. van Leeuwen
+## Hierarchical clustering, similar to R's hclust()
+
+## Algorithms are based upon C. F. Olson, Parallel Computing 21 (1995) 1313--1325.
+
+## This is also in types.jl, but that is not read...
+## Mostly following R's hclust class
+type Hclust{T<:Real}
+    merge::Matrix{Int}
+    height::Vector{T}
+    order::Vector{Int}
+    labels::Vector
+    method::Symbol
+end
+
+function assertdistancematrix(d::AbstractMatrix)
+    nr, nc = size(d)
+    nr == nc || throw(DimensionMismatch("Distance matrix should be square."))
+    issym(d) || error("Distance matrix should be symmetric.")
+end
+
+## This seems to work like R's implementation, but it is extremely inefficient:
+## it probably scales as O(n^3) or worse.  We can use it to check correctness.
+function hclust_n3{T<:Real}(d::AbstractMatrix{T}, method::Function)
+    assertdistancematrix(d)
+    mr = Int[]                  # min row
+    mc = Int[]                  # min col
+    h = T[]                     # height
+    nc = size(d,1)              # number of clusters
+    cl = -[1:nc]                # segment-to-cluster attribution, initially negative
+    next = 1                    # next cluster label
+    while next < nc
+        mindist = Inf
+        mi = mj = 0
+        cli = unique(cl)
+        mask = BitVector(nc)
+        for j in 1:length(cli)  # loop over the lower triangle, i > j
+            cols = cl .== cli[j]
+            for i in (j+1):length(cli)
+                rows = cl .== cli[i]
+                distance = method(d[rows,cols]) # very expensive
+                if distance < mindist
+                    mindist = distance
+                    mi = cli[i]
+                    mj = cli[j]
+                    mask = cols | rows
+                end
+            end
+        end
+        ## simulate R's ordering within a merge pair
+        if mi < 0 && mj < 0 && mi > mj ||
+           mi > 0 && mj > 0 && mi < mj ||
+           mi < 0 && mj > 0
+            push!(mr, mi)
+            push!(mc, mj)
+        else
+            push!(mr, mj)
+            push!(mc, mi)
+        end
+        push!(h, mindist)
+        cl[mask] = next
+        next += 1
+    end
+    hcat(mr, mc), h
+end
+
+## Efficient single-link algorithm, according to Olson, O(n^2), fig. 2.
+## Verified against R's implementation: correct, and about 2.5x faster.
+## For each i < j compute D(i,j) (this is already given)
+## For each 0 < i <= n compute nearest neighbor N(i)
+## Repeat n-1 times
+##   find i,j that minimize D(i,j)
+##   merge clusters i and j
+##   update D(i,j) and N(i) accordingly
+function hclust_minimum{T<:Real}(ds::Symmetric{T})
+    ## For each i < j compute d[i,j] (this is already given)
+    d = full(ds)                # we need a local copy
+    nc = size(d,1)
+    mr = Array(Int, nc-1)       # min row
+    mc = Array(Int, nc-1)       # min col
+    h = Array(T, nc-1)          # height
+    merges = -[1:nc]
+    next = 1
+    ## For each 0 < i <= n compute nearest neighbor N[i]
+    N = zeros(Int, nc)
+    for k = 1:nc
+        mindist = Inf
+        mk = 0
+        for i = 1:(k-1)
+            if d[i,k] < mindist
+                mindist = d[i,k]
+                mk = i
+            end
+        end
+        for j = (k+1):nc
+            if d[k,j] < mindist
+                mindist = d[k,j]
+                mk = j
+            end
+        end
+        N[k] = mk
+    end
+    ## the main loop
+    while nc > 1                # O(n)
+        mindist = d[1,N[1]]
+        i = 1
+        for k in 2:nc           # O(n)
+            if k < N[k]
+                distance = d[k,N[k]]
+            else
+                distance = d[N[k],k]
+            end
+            if distance < mindist
+                mindist = distance
+                i = k
+            end
+        end
+        j = N[i]
+        if i > j
+            i, j = j, i         # make sure i < j
+        end
+        ## update the result, compatible with R's ordering.
+        ## It should be possible to do this more simply...
+        if merges[i] < 0 && merges[j] < 0 && merges[i] > merges[j] ||
+           merges[i] > 0 && merges[j] > 0 && merges[i] < merges[j] ||
+           merges[i] < 0 && merges[j] > 0
+            mr[next] = merges[i]
+            mc[next] = merges[j]
+        else
+            mr[next] = merges[j]
+            mc[next] = merges[i]
+        end
+        h[next] = mindist
+        merges[i] = next
+        merges[j] = merges[nc]
+        ## update d, split in the ranges k < i, i < k < j, j < k <= nc
+        for k = 1:(i-1)         # k < i
+            if d[k,i] > d[k,j]
+                d[k,i] = d[k,j]
+            end
+        end
+        for k = (i+1):(j-1)     # i < k < j
+            if d[i,k] > d[k,j]
+                d[i,k] = d[k,j]
+            end
+        end
+        for k = (j+1):nc        # j < k <= nc
+            if d[i,k] > d[j,k]
+                d[i,k] = d[j,k]
+            end
+        end
+        ## move the last row/col into j
+        for k = 1:(j-1)         # k < j
+            d[k,j] = d[k,nc]
+        end
+        for k = (j+1):(nc-1)    # j < k < nc
+            d[j,k] = d[k,nc]
+        end
+        ## update N[k] for k not in (i,j)
+        for k = 1:nc
+            if N[k] == j        # update nearest neighbors != i
+                N[k] = i
+            elseif N[k] == nc
+                N[k] = j
+            end
+        end
+        N[j] = N[nc]            # update N[j]
+        ## update nc, next
+        nc -= 1
+        next += 1
+        ## finally, update N[i], because it was nearest to j
+        mindist = Inf
+        mk = 0
+        for k = 1:(i-1)
+            if d[k,i] < mindist
+                mindist = d[k,i]
+                mk = k
+            end
+        end
+        for k = (i+1):nc
+            if d[i,k] < mindist
+                mindist = d[i,k]
+                mk = k
+            end
+        end
+        N[i] = mk
+#       for n in N[1:nc] print(n, " ") end; println()
+    end
+    return hcat(mr, mc), h
+end
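+
+## A small worked example (hand-computed, purely illustrative): for
+##     d = [0. 1. 3.;
+##          1. 0. 2.;
+##          3. 2. 0.]
+## hclust_minimum(Symmetric(d)) merges points 1 and 2 at distance 1, then
+## joins point 3 at distance min(d[1,3], d[2,3]) = 2, returning
+##     merge  = [-1 -2; -3 1]
+##     height = [1.0, 2.0]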
+
+## functions to compute the maximum, minimum, or mean distance between two
+## clusters, given as index slices into the distance matrix
+
+function slicemaximum{T<:Real}(d::AbstractMatrix{T}, cl1::Vector{Int}, cl2::Vector{Int})
+    maxdist = -Inf
+    for i in cl1, j in cl2
+        if d[i,j] > maxdist
+            maxdist = d[i,j]
+        end
+    end
+    maxdist
+end
+
+function sliceminimum{T<:Real}(d::AbstractMatrix{T}, cl1::Vector{Int}, cl2::Vector{Int})
+    mindist = Inf
+    for i in cl1, j in cl2
+        if d[i,j] < mindist
+            mindist = d[i,j]
+        end
+    end
+    mindist
+end
+
+function slicemean{T<:Real}(d::AbstractMatrix{T}, cl1::Vector{Int}, cl2::Vector{Int})
+    s = zero(T)
+    for i in cl1, j in cl2
+        s += d[i,j]
+    end
+    s / (length(cl1)*length(cl2))
+end
+
+## Reorder the merge pairs to be compatible with R's hclust(): merges are
+## sorted by height, and cluster indices are relabelled accordingly.
+function rorder!(mr, mc, h)
+    o = sortperm(h)
+    io = invperm(o)
+    for i in 1:length(mr)
+        if mr[i] > 0
+            mr[i] = io[mr[i]]
+        end
+        if mc[i] > 0
+            mc[i] = io[mc[i]]
+        end
+        ## R's ordering within a pair
+        if ! (mr[i] < 0 && mc[i] < 0 && mr[i] > mc[i] ||
+              mr[i] > 0 && mc[i] > 0 && mr[i] < mc[i] ||
+              mr[i] < 0 && mc[i] > 0)
+            mr[i], mc[i] = mc[i], mr[i]
+        end
+    end
+    return o
+end
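+
+## Illustration (hand-computed): if merges were recorded at heights [3.0, 1.0, 2.0],
+## then o = sortperm(h) = [2, 3, 1], i.e. the merge recorded second becomes step 1,
+## and a positive (cluster) index k in mr/mc is relabelled to io[k], io = invperm(o).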
+
+## Another nearest-neighbor algorithm, for reducible metrics.
+## From C. F. Olson, Parallel Computing 21 (1995) 1313--1325, fig. 5.
+## Verified against the R implementation for mean and maximum: correct but ~ 5x slower.
+## Pick c1: 0 < c1 <= n at random
+## i <- 1
+## repeat n-1 times
+##   repeat
+##     i++
+##     c[i] = nearest neighbor of c[i-1]
+##   until c[i] = c[i-2]        ## nearest neighbor of nearest neighbor is the cluster itself
+##   merge c[i] and its nearest neighbor
+##   if i > 3 then i -= 3 else i <- 1
+function hclust2{T<:Real}(d::Symmetric{T}, method::Function)
+    nc = size(d,1)
+    mr = Array(Int, nc-1)       # min row
+    mc = Array(Int, nc-1)       # min col
+    h = Array(T, nc-1)          # height
+    cl = map(x->[x], 1:nc)      # clusters
+    merges = -[1:nc]
+    next = 1
+    i = 1
+    N = Array(Int, nc)
+    N[1] = 1                    # arbitrary choice
+    while nc > 1
+        found = false
+        mindist = Inf
+        while !found
+            i += 1
+            mi = 0
+            mindist = Inf
+            Nim1 = N[i-1]
+            ## c[i] = nearest neighbor of c[i-1]
+            for j = 1:nc
+                if Nim1 != j
+                    distance = method(d, cl[Nim1], cl[j])
+                    if distance < mindist
+                        mindist = distance
+                        mi = j
+                    end
+                end
+            end
+            N[i] = mi           # N[i] is the nearest neighbor of N[i-1]
+            found = i > 2 && N[i] == N[i-2]
+        end
+        ## merge c[i] and its nearest neighbor, i.e., c[i-1]
+        if N[i-1] < N[i]
+            lo, high = N[i-1], N[i]
+        else
+            lo, high = N[i], N[i-1]
+        end
+        ## first, store the result
+        mr[next] = merges[lo]
+        mc[next] = merges[high]
+        h[next] = mindist
+        merges[lo] = next
+        merges[high] = merges[nc]
+        next += 1
+        ## then perform the actual merge
+        cl[lo] = vcat(cl[lo], cl[high])
+        cl[high] = cl[nc]
+        if i > 3
+            i -= 3
+        else
+            i = 1
+        end
+        ## replace any nearest neighbor referring to nc
+        for k = 1:i
+            if N[k] == nc
+                N[k] = high
+            end
+        end
+        nc -= 1
+    end
+    ## fix the order for presenting the result
+    o = rorder!(mr, mc, h)
+    hcat(mr[o], mc[o]), h[o]
+end
+
+## hclust() dispatches to the fastest routine known to give the correct answer;
+## method names are inspired by R's hclust().
+function hclust{T<:Real}(d::Symmetric{T}, method::Symbol)
+    if method == :single
+        h = hclust_minimum(d)
+    elseif method == :complete
+        h = hclust2(d, slicemaximum)
+    elseif method == :average
+        h = hclust2(d, slicemean)
+    else
+        error("Unsupported method ", method)
+    end
+    nc = size(d,1)
+    ## order and labels are placeholders for the moment
+    Hclust(h..., [1:nc], [1:nc], method)
+end
+
+## uplo may be a Char for v0.3, a Symbol for v0.4
+hclust{T<:Real}(d::AbstractMatrix{T}, method::Symbol, uplo) = hclust(Symmetric(d, uplo), method)
+
+function hclust{T<:Real}(d::AbstractMatrix{T}, method::Symbol)
+    assertdistancematrix(d)
+    hclust(Symmetric(d), method)
+end
+
+## cut a tree at height `h' or into `k' clusters
+function cutree(hclust::Hclust; k::Int=1,
+                h::Real=maximum(hclust.height))
+    clusters = Vector{Int}[]
+    nnodes = length(hclust.labels)
+    nodes = [[i::Int] for i=1:nnodes]
+    N = nnodes - k
+    i = 1
+    while i <= N && hclust.height[i] <= h
+        both = vec(hclust.merge[i,:])
+        new = Int[]
+        for x in both
+            if x < 0            # negative entries are leaves
+                push!(new, -x)
+                nodes[-x] = Int[]
+            else                # positive entries are earlier merges
+                append!(new, clusters[x])
+                clusters[x] = Int[]
+            end
+        end
+        push!(clusters, new)
+        i += 1
+    end
+    all = vcat(clusters, nodes)
+    all = all[map(length, all) .> 0]
+    ## convert to a single array of cluster indices
+    res = Array(Int, nnodes)
+    for (i,cl) in enumerate(all)
+        res[cl] = i
+    end
+    res
+end
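+
+## Continuing the hand-computed 3-point example above: for an Hclust h with
+## merge = [-1 -2; -3 1], height = [1.0, 2.0] and labels [1:3],
+## cutree(h, k=2) returns [1, 1, 2], i.e. points 1 and 2 share a cluster
+## and point 3 is a singleton.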
+
+## some diagnostic functions, not exported
+function printupper(d::Matrix)
+    n = size(d,1)
+    for i = 1:(n-1)
+        print(" " ^ ((i-1) * 6))
+        for j = (i+1):n
+            print(@sprintf("%5.2f ", d[i,j]))
+        end
+        println()
+    end
+end
diff --git a/src/types.jl b/src/types.jl
index be915b04..a3765409 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -56,3 +56,12 @@ function show(io::IO, results::DPMeansOutput)
     print(io, "Algorithm Converged: ")
     println(io, results.converged)
 end
+
+## Mostly following R's hclust class
+type Hclust{T<:Real}
+    merge::Matrix{Int}
+    height::Vector{T}
+    order::Vector{Int}
+    labels::Vector
+    method::Symbol
+end
diff --git a/test/hclust.R b/test/hclust.R
new file mode 100644
index 00000000..a36bea55
--- /dev/null
+++ b/test/hclust.R
@@ -0,0 +1,17 @@
+## Source this file after running test/hclust.jl in Julia to check that we get
+## the same answers as R does.  Clustering is tricky...
+
+compare <- function(method="complete") {
+  d <- scan("hclust.txt")
+  n <- sqrt(length(d))
+  d <- matrix(d, n, n)
+  merge <- t(matrix(scan(paste("merge-", method, ".txt", sep="")), 2, n-1))
+  height <- scan(paste("height-", method, ".txt", sep=""))
+  h <- hclust(as.dist(d), method)
+  cat("Heights: ", sum(abs(h$height - height)), "\n")
+  diffids <- merge[,1] != h$merge[,1] | merge[,2] != h$merge[,2]
+  print(cbind(1:(n-1), merge, h$merge, h$height)[diffids,])
+}
+
+for (m in c("single", "complete", "average"))
+  compare(m)
diff --git a/test/hclust.jl b/test/hclust.jl
new file mode 100644
index 00000000..ef28c186
--- /dev/null
+++ b/test/hclust.jl
@@ -0,0 +1,26 @@
+## cluster a random distance matrix and write the results to disk, so that
+## test/hclust.R can compare them against R's hclust()
+function test_hclust(d::Matrix, method=:complete)
+    h = hclust(d, method)
+    writedlm("merge-$method.txt", h.merge)
+    writedlm("height-$method.txt", h.height)
+    c = cutree(h, h=0)
+    writedlm("cutree-$method.txt", c)
+    d
+end
+
+## generate a random symmetric "distance" matrix
+function gendist(N::Int)
+    d = randn(N,N)
+    d += d'
+    d
+end
+
+d = gendist(1000)
+writedlm("hclust.txt", d)
+
+for m in [:single, :complete, :average]
+    test_hclust(d, m)
+end
+
+## Now run the following command and inspect the output:
+## the height difference should be 0 and the printed matrix should have 0 rows.
+
+# run(`Rscript test/hclust.R`)
diff --git a/test/runtests.jl b/test/runtests.jl
index 8fd19dfb..6ec1ee96 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -5,7 +5,8 @@ tests = ["seeding",
          "affprop",
          "dbscan",
          "silhouette",
-         "varinfo"]
+         "varinfo",
+         "hclust"]
 
 println("Runing tests:")
 for t in tests
@@ -13,4 +14,3 @@ for t in tests
     println("* $fp ...")
     include(fp)
 end
-