-
Notifications
You must be signed in to change notification settings - Fork 120
/
Copy pathdbscan.jl
187 lines (164 loc) · 7.52 KB
/
dbscan.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# DBSCAN Clustering
#
"""
DbscanCluster
DBSCAN cluster, part of [`DbscanResult`](@ref) returned by [`dbscan`](@ref) function.
## Fields
- `size::Int`: number of points in a cluster (core + boundary)
- `core_indices::Vector{Int}`: indices of points in the cluster *core*, a.k.a. *seeds*
(have at least `min_neighbors` neighbors in the cluster)
- `boundary_indices::Vector{Int}`: indices of the cluster points outside of *core*
"""
struct DbscanCluster
size::Int
core_indices::Vector{Int}
boundary_indices::Vector{Int}
end
"""
DbscanResult <: ClusteringResult
The output of [`dbscan`](@ref) function.
## Fields
- `clusters::Vector{DbscanCluster}`: clusters, length *K*
- `seeds::Vector{Int}`: indices of the first points of each cluster's *core*, length *K*
- `counts::Vector{Int}`: cluster sizes (number of assigned points), length *K*
- `assignments::Vector{Int}`: vector of clusters indices, where each point was assigned to, length *N*
"""
struct DbscanResult <: ClusteringResult
clusters::Vector{DbscanCluster}
seeds::Vector{Int}
counts::Vector{Int}
assignments::Vector{Int}
function DbscanResult(clusters::AbstractVector{DbscanCluster}, num_points::Integer)
assignments = zeros(Int, num_points)
for (i, clu) in enumerate(clusters)
assignments[clu.core_indices] .= i
assignments[clu.boundary_indices] .= i
end
new(clusters,
[c.core_indices[1] for c in clusters],
[c.size for c in clusters],
assignments)
end
end
"""
dbscan(points::AbstractMatrix, radius::Real;
[metric=Euclidean()],
[min_neighbors=1], [min_cluster_size=1],
[nntree_kwargs...]) -> DbscanResult
Cluster `points` using the DBSCAN (Density-Based Spatial Clustering of
Applications with Noise) algorithm.
## Arguments
- `points`: when `metric` is specified, the *d×n* matrix, where
each column is a *d*-dimensional coordinate of a point;
when `metric=nothing`, the *n×n* matrix of pairwise distances between the points
- `radius::Real`: neighborhood radius; points within this distance
are considered neighbors
Optional keyword arguments to control the algorithm:
- `metric` (defaults to `Euclidean()`): the points distance metric to use,
`nothing` means `points` is the *n×n* precalculated distance matrix
- `min_neighbors::Integer` (defaults to 1): the minimal number of neighbors
required to assign a point to a cluster "core"
- `min_cluster_size::Integer` (defaults to 1): the minimal number of points in
a cluster; cluster candidates with fewer points are discarded
- `nntree_kwargs...`: parameters (like `leafsize`) for the `KDTree` constructor
## Example
```julia
points = randn(3, 10000)
# DBSCAN clustering, clusters with less than 20 points will be discarded:
clustering = dbscan(points, 0.05, min_neighbors = 3, min_cluster_size = 20)
```
## References:
* Martin Ester, Hans-Peter Kriegel, Jörg Sander, and Xiaowei Xu,
*"A density-based algorithm for discovering clusters
in large spatial databases with noise"*, KDD-1996, pp. 226--231.
* Erich Schubert, Jörg Sander, Martin Ester, Hans Peter Kriegel, and
Xiaowei Xu, *"DBSCAN Revisited, Revisited: Why and How You Should
(Still) Use DBSCAN"*, ACM Transactions on Database Systems,
Vol.42(3)3, pp. 1--21, https://doi.org/10.1145/3068335
"""
function dbscan(points::AbstractMatrix, radius::Real;
metric = Euclidean(),
min_neighbors::Integer = 1, min_cluster_size::Integer = 1,
nntree_kwargs...)
0 <= radius || throw(ArgumentError("radius $radius must be ≥ 0"))
if metric !== nothing
# points are point coordinates
dim, num_points = size(points)
num_points <= dim && throw(ArgumentError("points has $dim rows and $num_points columns. Must be a D x N matric with D < N"))
kdtree = KDTree(points, metric; nntree_kwargs...)
data = (kdtree, points)
else
# points is a distance matrix
num_points = size(points, 1)
size(points, 2) == num_points || throw(ArgumentError("When metric=nothing, points must be a square distance matrix ($(size(points)) given)."))
num_points >= 2 || throw(ArgumentError("At least two data points are required ($num_points given)."))
data = points
end
clusters = _dbscan(data, num_points, radius, min_neighbors, min_cluster_size)
return DbscanResult(clusters, num_points)
end
# An implementation of DBSCAN algorithm that keeps track of both the core and boundary points
function _dbscan(data::Union{AbstractMatrix, Tuple{NNTree, AbstractMatrix}},
num_points::Integer, radius::Real,
min_neighbors::Integer, min_cluster_size::Integer)
1 <= min_neighbors || throw(ArgumentError("min_neighbors $min_neighbors must be ≥ 1"))
1 <= min_cluster_size || throw(ArgumentError("min_cluster_size $min_cluster_size must be ≥ 1"))
clusters = Vector{DbscanCluster}()
visited = fill(false, num_points)
cluster_mask = Vector{Bool}(undef, num_points)
core_mask = similar(cluster_mask)
to_explore = Vector{Int}()
neighbors = Vector{Int}()
@inbounds for i = 1:num_points
visited[i] && continue
@assert isempty(to_explore)
push!(to_explore, i) # start a new cluster
fill!(core_mask, false)
fill!(cluster_mask, false)
# depth-first search to find all points in the cluster
while !isempty(to_explore)
point = popfirst!(to_explore)
visited[point] && continue
visited[point] = true
_dbscan_region_query!(neighbors, data, point, radius)
cluster_mask[neighbors] .= true # mark as candidates
# if a point has enough neighbors, it is a 'core' point and its neighbors are added to the to_explore list
if length(neighbors) >= min_neighbors
core_mask[point] = true
for j in neighbors
visited[j] || push!(to_explore, j)
end
end
empty!(neighbors)
end
# if the cluster has core and is large enough, it is accepted
if any(core_mask) && (cluster_size = sum(cluster_mask)) >= min_cluster_size
core = Vector{Int}()
boundary = Vector{Int}()
for (i, (is_cluster, is_core)) in enumerate(zip(cluster_mask, core_mask))
@assert is_core && is_cluster || !is_core # core is always in a cluster
is_cluster && push!(ifelse(is_core, core, boundary), i)
end
@assert !isempty(core)
push!(clusters, DbscanCluster(cluster_size, core, boundary))
end
end
return clusters
end
# distance matrix-based
function _dbscan_region_query!(neighbors::AbstractVector, dists::AbstractMatrix,
point::Integer, radius::Real)
empty!(neighbors)
for (i, dist) in enumerate(view(dists, :, point))
(dist <= radius) && push!(neighbors, i)
end
return neighbors
end
# NN-tree based
function _dbscan_region_query!(neighbors::AbstractVector,
nntree_and_points::Tuple{NNTree, AbstractMatrix},
point::Integer, radius::Real)
nntree, points = nntree_and_points
empty!(neighbors)
return append!(neighbors, inrange(nntree, view(points, :, point), radius))
end