Skip to content

Commit

Permalink
Updates to enable HDBSCAN (#208)
Browse files Browse the repository at this point in the history
Authors:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: #208
  • Loading branch information
cjnolet authored May 27, 2021
1 parent c00d3b2 commit d576348
Show file tree
Hide file tree
Showing 7 changed files with 86 additions and 141 deletions.
84 changes: 16 additions & 68 deletions cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,15 @@ class UnionFind {
};

/**
* Standard single-threaded agglomerative labeling on host. This should work
* well for smaller sizes of m. This is a C++ port of the original reference
* implementation of HDBSCAN.
* Agglomerative labeling on host. This has not been found to be a bottleneck
* in the algorithm. A parallel version of this can be done using a parallel
* variant of Kruskal's MST algorithm
* (ref http://cucis.ece.northwestern.edu/publications/pdf/HenPat12.pdf),
* which breaks apart the sorted MST results into overlapping subsets and
* independently runs Kruskal's algorithm on each subset, merging them back
* together into a single hierarchy when complete. Unfortunately,
* this is nontrivial and the speedup wouldn't be useful until this
* becomes a bottleneck.
*
* @tparam value_idx
* @tparam value_t
Expand All @@ -91,9 +97,8 @@ class UnionFind {
template <typename value_idx, typename value_t>
void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
const value_idx *cols, const value_t *data,
size_t nnz, value_idx *children,
rmm::device_uvector<value_t> &out_delta,
rmm::device_uvector<value_idx> &out_size) {
size_t nnz, value_idx *children, value_t *out_delta,
value_idx *out_size) {
auto d_alloc = handle.get_device_allocator();
auto stream = handle.get_stream();

Expand All @@ -103,8 +108,6 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
std::vector<value_idx> mst_dst_h(n_edges);
std::vector<value_t> mst_weights_h(n_edges);

printf("n_edges: %d\n", n_edges);

update_host(mst_src_h.data(), rows, n_edges, stream);
update_host(mst_dst_h.data(), cols, n_edges, stream);
update_host(mst_weights_h.data(), data, n_edges, stream);
Expand All @@ -113,12 +116,14 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,

std::vector<value_idx> children_h(n_edges * 2);
std::vector<value_idx> out_size_h(n_edges);
std::vector<value_t> out_delta_h(n_edges);

UnionFind<value_idx, value_t> U(nnz + 1);

for (value_idx i = 0; i < nnz; i++) {
value_idx a = mst_src_h[i];
value_idx b = mst_dst_h[i];
value_t delta = mst_weights_h[i];

value_idx aa = U.find(a);
value_idx bb = U.find(b);
Expand All @@ -127,72 +132,15 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,

children_h[children_idx] = aa;
children_h[children_idx + 1] = bb;
out_delta_h[i] = delta;
out_size_h[i] = U.size[aa] + U.size[bb];

U.perform_union(aa, bb);
}

out_size.resize(n_edges, stream);

printf("Finished dendrogram\n");

raft::update_device(children, children_h.data(), n_edges * 2, stream);
raft::update_device(out_size.data(), out_size_h.data(), n_edges, stream);
}

/**
* Parallel agglomerative labeling. This amounts to a parallel Kruskal's
* MST algorithm, which breaks apart the sorted MST results into overlapping
* subsets and independently runs Kruskal's algorithm on each subset,
* merging them back together into a single hierarchy when complete.
*
* This outputs the same format as the reference HDBSCAN, but as 4 separate
* arrays, rather than a single 2D array.
*
* Reference: http://cucis.ece.northwestern.edu/publications/pdf/HenPat12.pdf
*
* TODO: Investigate potential for the following end-to-end single-hierarchy batching:
* For each of k (independent) batches over the input:
* - Sample n elements from X
* - Compute mutual reachability graph of batch
* - Construct labels from batch
*
* The sampled datasets should have some overlap across batches. This will
* allow for the cluster hierarchies to be merged. Being able to batch
* will reduce the memory cost so that the full n^2 pairwise distances
* don't need to be materialized in memory all at once.
*
* @tparam value_idx
* @tparam value_t
* @param[in] handle the raft handle
* @param[in] rows src edges of the sorted MST
* @param[in] cols dst edges of the sorted MST
* @param[in] nnz the number of edges in the sorted MST
* @param[out] out_src parents of output
* @param[out] out_dst children of output
* @param[out] out_delta distances of output
* @param[out] out_size cluster sizes of output
* @param[in] k_folds number of folds for parallelizing label step
*/
template <typename value_idx, typename value_t>
void build_dendrogram_device(const handle_t &handle, const value_idx *rows,
const value_idx *cols, const value_t *data,
value_idx nnz, value_idx *children,
value_t *out_delta, value_idx *out_size,
value_idx k_folds) {
ASSERT(k_folds < nnz / 2, "k_folds must be < n_edges / 2");
/**
* divide (sorted) mst coo into overlapping subsets. Easiest way to do this is to
* break it into k-folds and iterate through two folds at a time.
*/

// 1. Generate ranges for the overlapping subsets

// 2. Run union-find in parallel for each pair of folds

// 3. Sort individual label hierarchies

// 4. Merge label hierarchies together
raft::update_device(out_size, out_size_h.data(), n_edges, stream);
raft::update_device(out_delta, out_delta_h.data(), n_edges, stream);
}

template <typename value_idx>
Expand Down
76 changes: 23 additions & 53 deletions cpp/include/raft/sparse/hierarchy/detail/mst.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <raft/cudart_utils.h>
#include <raft/cuda_utils.cuh>

#include <raft/sparse/op/sort.h>
#include <raft/mr/device/buffer.hpp>
#include <raft/sparse/mst/mst.cuh>
#include <raft/sparse/selection/connect_components.cuh>
Expand All @@ -35,29 +36,6 @@ namespace raft {
namespace hierarchy {
namespace detail {

/**
* Sorts a COO by its weight
* @tparam value_idx
* @tparam value_t
* @param[inout] rows source edges
* @param[inout] cols dest edges
* @param[inout] data edge weights
* @param[in] nnz number of edges in edge list
* @param[in] stream cuda stream for which to order cuda operations
*/
template <typename value_idx, typename value_t>
void sort_coo_by_data(value_idx *rows, value_idx *cols, value_t *data,
value_idx nnz, cudaStream_t stream) {
thrust::device_ptr<value_idx> t_rows = thrust::device_pointer_cast(rows);
thrust::device_ptr<value_idx> t_cols = thrust::device_pointer_cast(cols);
thrust::device_ptr<value_t> t_data = thrust::device_pointer_cast(data);

auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols));

thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz,
first);
}

template <typename value_idx, typename value_t>
void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t> &coo1,
raft::Graph_COO<value_idx, value_idx, value_t> &coo2,
Expand Down Expand Up @@ -95,19 +73,20 @@ void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t> &coo1,
* @param[inout] color the color labels array returned from the mst invocation
* @return updated MST edge list
*/
template <typename value_idx, typename value_t>
template <typename value_idx, typename value_t, typename red_op>
void connect_knn_graph(const raft::handle_t &handle, const value_t *X,
raft::Graph_COO<value_idx, value_idx, value_t> &msf,
size_t m, size_t n, value_idx *color,
red_op reduction_op,
raft::distance::DistanceType metric =
raft::distance::DistanceType::L2SqrtExpanded) {
auto d_alloc = handle.get_device_allocator();
auto stream = handle.get_stream();

raft::sparse::COO<value_t, value_idx> connected_edges(d_alloc, stream);

raft::linkage::connect_components<value_idx, value_t>(handle, connected_edges,
X, color, m, n);
raft::linkage::connect_components<value_idx, value_t>(
handle, connected_edges, X, color, m, n, reduction_op);

rmm::device_uvector<value_idx> indptr2(m + 1, stream);
raft::sparse::convert::sorted_coo_to_csr(connected_edges.rows(),
Expand Down Expand Up @@ -147,38 +126,34 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X,
* @param[in] max_iter maximum iterations to run knn graph connection. This
* argument is really just a safeguard against the potential for infinite loops.
*/
template <typename value_idx, typename value_t>
template <typename value_idx, typename value_t, typename red_op>
void build_sorted_mst(const raft::handle_t &handle, const value_t *X,
const value_idx *indptr, const value_idx *indices,
const value_t *pw_dists, size_t m, size_t n,
rmm::device_uvector<value_idx> &mst_src,
rmm::device_uvector<value_idx> &mst_dst,
rmm::device_uvector<value_t> &mst_weight,
const size_t nnz,
value_idx *mst_src, value_idx *mst_dst,
value_t *mst_weight, value_idx *color, size_t nnz,
red_op reduction_op,
raft::distance::DistanceType metric =
raft::distance::DistanceType::L2SqrtExpanded,
int max_iter = 10) {
auto d_alloc = handle.get_device_allocator();
auto stream = handle.get_stream();

rmm::device_uvector<value_idx> color(m, stream);

// We want to have MST initialize colors on first call.
auto mst_coo = raft::mst::mst<value_idx, value_idx, value_t, double>(
handle, indptr, indices, pw_dists, (value_idx)m, nnz, color.data(), stream,
false, true);
handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false,
true);

int iters = 1;
int n_components =
linkage::get_n_components(color.data(), m, d_alloc, stream);
int n_components = linkage::get_n_components(color, m, d_alloc, stream);

while (n_components > 1 && iters < max_iter) {
connect_knn_graph<value_idx, value_t>(handle, X, mst_coo, m, n,
color.data());
connect_knn_graph<value_idx, value_t>(handle, X, mst_coo, m, n, color,
reduction_op);

iters++;

n_components = linkage::get_n_components(color.data(), m, d_alloc, stream);
n_components = linkage::get_n_components(color, m, d_alloc, stream);
}

/**
Expand All @@ -189,7 +164,7 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X,
* 1. There is a bug in this code somewhere
* 2. Either the given KNN graph wasn't generated from X or the same metric is not being used
* to generate the 1-nn (currently only L2SqrtExpanded is supported).
* 3. max_iter was not large enough to connect the graph.
* 3. max_iter was not large enough to connect the graph (less likely).
*
* Note that a KNN graph generated from 50 random isotropic balls (with significant overlap)
* was able to be connected in a single iteration.
Expand All @@ -201,20 +176,15 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X,
" or increase 'max_iter'",
max_iter);

sort_coo_by_data(mst_coo.src.data(), mst_coo.dst.data(),
mst_coo.weights.data(), mst_coo.n_edges, stream);

// TODO: be nice if we could pass these directly into the MST
mst_src.resize(mst_coo.n_edges, stream);
mst_dst.resize(mst_coo.n_edges, stream);
mst_weight.resize(mst_coo.n_edges, stream);
raft::sparse::op::coo_sort_by_weight(mst_coo.src.data(), mst_coo.dst.data(),
mst_coo.weights.data(), mst_coo.n_edges,
stream);

raft::copy_async(mst_src.data(), mst_coo.src.data(), mst_coo.n_edges, stream);
raft::copy_async(mst_dst.data(), mst_coo.dst.data(), mst_coo.n_edges, stream);
raft::copy_async(mst_weight.data(), mst_coo.weights.data(), mst_coo.n_edges,
stream);
raft::copy_async(mst_src, mst_coo.src.data(), mst_coo.n_edges, stream);
raft::copy_async(mst_dst, mst_coo.dst.data(), mst_coo.n_edges, stream);
raft::copy_async(mst_weight, mst_coo.weights.data(), mst_coo.n_edges, stream);
}

}; // namespace detail
}; // namespace hierarchy
}; // namespace raft
}; // namespace raft
15 changes: 9 additions & 6 deletions cpp/include/raft/sparse/hierarchy/single_linkage.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,19 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
detail::get_distance_graph<value_idx, value_t, dist_type>(
handle, X, m, n, metric, indptr, indices, pw_dists, c);

rmm::device_uvector<value_idx> mst_rows(EMPTY, stream);
rmm::device_uvector<value_idx> mst_cols(EMPTY, stream);
rmm::device_uvector<value_t> mst_data(EMPTY, stream);
rmm::device_uvector<value_idx> mst_rows(m - 1, stream);
rmm::device_uvector<value_idx> mst_cols(m - 1, stream);
rmm::device_uvector<value_t> mst_data(m - 1, stream);

/**
* 2. Construct MST, sorted by weights
*/
rmm::device_uvector<value_idx> color(m, stream);
raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> op(color.data(), m);
detail::build_sorted_mst<value_idx, value_t>(
handle, X, indptr.data(), indices.data(), pw_dists.data(), m, n, mst_rows,
mst_cols, mst_data, indices.size(), metric);
handle, X, indptr.data(), indices.data(), pw_dists.data(), m, n,
mst_rows.data(), mst_cols.data(), mst_data.data(), color.data(),
indices.size(), op, metric);

pw_dists.release();

Expand All @@ -93,7 +96,7 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
// Create dendrogram
detail::build_dendrogram_host<value_idx, value_t>(
handle, mst_rows.data(), mst_cols.data(), mst_data.data(), n_edges,
out->children, out_delta, out_size);
out->children, out_delta.data(), out_size.data());
detail::extract_flattened_clusters(handle, out->labels, out->children,
n_clusters, m);

Expand Down
4 changes: 2 additions & 2 deletions cpp/include/raft/sparse/op/reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ void max_duplicates(const raft::handle_t &handle,

compute_duplicates_mask(diff.data(), rows, cols, nnz, stream);

thrust::exclusive_scan(exec_policy, diff.data(), diff.data() + diff.size(),
diff.data());
thrust::exclusive_scan(thrust::cuda::par.on(stream), diff.data(),
diff.data() + diff.size(), diff.data());

// compute final size
value_idx size = 0;
Expand Down
23 changes: 23 additions & 0 deletions cpp/include/raft/sparse/op/sort.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,29 @@ void coo_sort(COO<T> *const in,
coo_sort<T>(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(),
in->vals(), d_alloc, stream);
}

/**
* Sorts a COO by its weight
* @tparam value_idx
* @tparam value_t
* @param[inout] rows source edges
* @param[inout] cols dest edges
* @param[inout] data edge weights
* @param[in] nnz number of edges in edge list
* @param[in] stream cuda stream for which to order cuda operations
*/
template <typename value_idx, typename value_t>
void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data,
value_idx nnz, cudaStream_t stream) {
thrust::device_ptr<value_idx> t_rows = thrust::device_pointer_cast(rows);
thrust::device_ptr<value_idx> t_cols = thrust::device_pointer_cast(cols);
thrust::device_ptr<value_t> t_data = thrust::device_pointer_cast(data);

auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols));

thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz,
first);
}
}; // namespace op
}; // end NAMESPACE sparse
}; // end NAMESPACE raft
Loading

0 comments on commit d576348

Please sign in to comment.