Skip to content

Commit

Permalink
approximate_predict function for HDBSCAN (rapidsai#4872)
Browse files Browse the repository at this point in the history
PR for HDBSCAN approximate_predict

- [x] Building cluster_map
- [x] Modifying PredictionData class
- [x] Obtaining nearest neighbor in MR space
- [x] Computing probability
- [x] Tests

Closes rapidsai#4877
Closes rapidsai#4448

Authors:
  - Tarang Jain (https://github.com/tarang-jain)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: rapidsai#4872
  • Loading branch information
tarang-jain authored Sep 3, 2022
1 parent cd33ec3 commit 58b63d1
Show file tree
Hide file tree
Showing 15 changed files with 4,912 additions and 250 deletions.
51 changes: 37 additions & 14 deletions cpp/include/cuml/cluster/hdbscan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,8 @@ template class CondensedHierarchy<int, float>;

/**
* Container object for computing and storing intermediate information needed later for computing
* membership vectors and approximate_predict.
* membership vectors and approximate predict. Users are only expected to create an instance of this
* object; the hdbscan method will do the rest.
* @tparam value_idx
* @tparam value_t
*/
Expand All @@ -322,6 +323,8 @@ class PredictionData {
n_selected_clusters(0),
selected_clusters(0, handle.get_stream()),
deaths(0, handle.get_stream()),
core_dists(m, handle.get_stream()),
index_into_children(0, handle.get_stream()),
n_exemplars(0),
n_rows(m),
n_cols(n)
Expand All @@ -339,21 +342,26 @@ class PredictionData {
value_idx* get_exemplar_label_offsets() { return exemplar_label_offsets.data(); }
value_idx* get_selected_clusters() { return selected_clusters.data(); }
value_t* get_deaths() { return deaths.data(); }
value_t* get_core_dists() { return core_dists.data(); }
value_idx* get_index_into_children() { return index_into_children.data(); }

/**
* Resize buffers to the required sizes for storing data
* @param handle raft handle for ordering cuda operations
* @param n_exemplars_ number of exemplar points
* @param n_selected_clusters_ number of clusters selected
* Resizes the buffers in the PredictionData object.
*
* @param[in] handle raft handle for resource reuse
* @param[in] n_exemplars_ number of exemplar points
* @param[in] n_selected_clusters_ number of selected clusters in the final clustering
* @param[in] n_edges_ number of edges in the condensed hierarchy
*/
void allocate(const raft::handle_t& handle,
value_idx n_exemplars_,
value_idx n_selected_clusters_);
value_idx n_selected_clusters_,
value_idx n_edges_);

/**
* Resize buffers for cluster deaths to n_clusters
* @param handle raft handle for ordering cuda operations
* @param n_clusters_
* @param n_clusters_ number of clusters
*/
void set_n_clusters(const raft::handle_t& handle, value_idx n_clusters_)
{
Expand All @@ -368,6 +376,8 @@ class PredictionData {
value_idx n_selected_clusters;
rmm::device_uvector<value_idx> selected_clusters;
rmm::device_uvector<value_t> deaths;
rmm::device_uvector<value_t> core_dists;
rmm::device_uvector<value_idx> index_into_children;
};

template class PredictionData<int, float>;
Expand Down Expand Up @@ -412,7 +422,7 @@ void hdbscan(const raft::handle_t& handle,
HDBSCAN::Common::hdbscan_output<int, float>& out);

/**
* Executes HDBSCAN clustering on an mxn-dimensional input array, X and builds the PredictionData
* Executes HDBSCAN clustering on an mxn-dimensional input array, X, then builds the PredictionData
* object which computes and stores information needed later for prediction algorithms.
*
* @param[in] handle raft handle for resource reuse
Expand Down Expand Up @@ -456,10 +466,23 @@ void _extract_clusters(const raft::handle_t& handle,
int max_cluster_size,
float cluster_selection_epsilon);

void _all_points_membership_vectors(const raft::handle_t& handle,
HDBSCAN::Common::CondensedHierarchy<int, float>& condensed_tree,
HDBSCAN::Common::PredictionData<int, float>& prediction_data,
float* membership_vec,
const float* X,
raft::distance::DistanceType metric);
/**
 * Declaration of the all-points membership vector computation. Only the
 * signature is visible here; the implementation lives elsewhere.
 *
 * @param[in] handle raft handle
 * @param[in] condensed_tree condensed hierarchy (int/float instantiation)
 * @param[in] prediction_data PredictionData container (int/float instantiation)
 * @param[in] X input data; presumably the same array that was clustered -- TODO confirm
 * @param[in] metric distance metric
 * @param[out] membership_vec presumably the output buffer for the membership
 *             vectors; its expected size is not visible here -- TODO confirm
 */
void compute_all_points_membership_vectors(
const raft::handle_t& handle,
HDBSCAN::Common::CondensedHierarchy<int, float>& condensed_tree,
HDBSCAN::Common::PredictionData<int, float>& prediction_data,
const float* X,
raft::distance::DistanceType metric,
float* membership_vec);

/**
 * Declaration of the out-of-sample prediction entry point. Only the signature
 * is visible here; the implementation lives elsewhere.
 *
 * @param[in] handle raft handle
 * @param[in] condensed_tree condensed hierarchy (int/float instantiation)
 * @param[in] prediction_data PredictionData container (int/float instantiation)
 * @param[in] X presumably the data the model was originally fit on -- TODO confirm
 * @param[in] labels presumably the cluster labels of the rows of X -- TODO confirm
 * @param[in] points_to_predict the new points to assign
 * @param[in] n_prediction_points number of points in points_to_predict
 * @param[in] metric distance metric
 * @param[in] min_samples presumably the same min_samples used when fitting -- TODO confirm
 * @param[out] out_labels presumably one predicted label per prediction point -- TODO confirm
 * @param[out] out_probabilities presumably one probability per prediction point -- TODO confirm
 */
void out_of_sample_predict(const raft::handle_t& handle,
HDBSCAN::Common::CondensedHierarchy<int, float>& condensed_tree,
HDBSCAN::Common::PredictionData<int, float>& prediction_data,
const float* X,
int* labels,
const float* points_to_predict,
size_t n_prediction_points,
raft::distance::DistanceType metric,
int min_samples,
int* out_labels,
float* out_probabilities);
} // END namespace ML
102 changes: 102 additions & 0 deletions cpp/src/hdbscan/detail/kernels/predict.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

namespace ML {
namespace HDBSCAN {
namespace detail {
namespace Predict {

/**
 * For each prediction point, picks the candidate neighbor with the smallest
 * mutual reachability distance. One thread processes one prediction point.
 *
 * The mutual reachability distance between a prediction point and a candidate
 * is the largest of: the prediction point's core distance, the candidate's
 * core distance, and the pairwise distance between the two.
 *
 * @tparam value_idx index type
 * @tparam value_t distance type
 * @param[in] input_core_dists core distances, indexed by the values stored in neighbor_indices
 * @param[in] prediction_core_dists one core distance per prediction point
 * @param[in] pairwise_dists distances to the candidates, laid out row-major as
 *            [n_prediction_points x neighborhood]
 * @param[in] neighbor_indices candidate indices, same layout as pairwise_dists
 * @param[in] n_prediction_points number of prediction points
 * @param[in] neighborhood number of candidates stored per prediction point
 * @param[out] min_mr_dists smallest mutual reachability distance per prediction point;
 *             std::numeric_limits<value_t>::max() when no candidate was selected
 * @param[out] min_mr_indices index of the selected candidate per prediction point;
 *             -1 when no candidate was selected
 */
template <typename value_idx, typename value_t>
__global__ void min_mutual_reachability_kernel(value_t* input_core_dists,
                                               value_t* prediction_core_dists,
                                               value_t* pairwise_dists,
                                               value_idx* neighbor_indices,
                                               size_t n_prediction_points,
                                               value_idx neighborhood,
                                               value_t* min_mr_dists,
                                               value_idx* min_mr_indices)
{
  // Form the global thread id in size_t so that neither the product
  // blockDim.x * blockIdx.x nor the comparison against n_prediction_points
  // is truncated to 32 bits.
  const size_t point = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
  if (point >= n_prediction_points) { return; }

  // Loop-invariant: read the prediction point's core distance once.
  const value_t point_core_dist = prediction_core_dists[point];

  // Row offset in size_t: point * neighborhood can exceed the range of a
  // 32-bit value_idx for large inputs.
  const size_t row_begin = point * static_cast<size_t>(neighborhood);

  value_t best_dist    = std::numeric_limits<value_t>::max();
  value_idx best_index = -1;

  for (value_idx k = 0; k < neighborhood; ++k) {
    const size_t slot        = row_begin + static_cast<size_t>(k);
    const value_idx neighbor = neighbor_indices[slot];

    // max(point core dist, neighbor core dist, pairwise dist), written with
    // explicit '>' comparisons so NaN inputs are handled exactly as before.
    value_t mr_dist                = point_core_dist;
    const value_t neighbor_core    = input_core_dists[neighbor];
    const value_t neighbor_pairwise = pairwise_dists[slot];
    if (neighbor_core > mr_dist) { mr_dist = neighbor_core; }
    if (neighbor_pairwise > mr_dist) { mr_dist = neighbor_pairwise; }

    // Strict comparison keeps the first candidate on ties.
    if (best_dist > mr_dist) {
      best_dist  = mr_dist;
      best_index = neighbor;
    }
  }

  min_mr_dists[point]   = best_dist;
  min_mr_indices[point] = best_index;
}

/**
 * Assigns a cluster label and a membership probability to each prediction
 * point, based on the label of its selected neighbor. One thread processes one
 * prediction point.
 *
 * A point inherits its neighbor's label when that label is non-negative and
 * either
 *   - the label's selected cluster id equals n_leaves, or
 *   - the selected cluster id is greater than n_leaves and the lambda stored
 *     for that cluster (looked up through index_into_children) is strictly
 *     less than the point's own lambda.
 * Otherwise the point is labelled -1 with probability 0.
 *
 * For a labelled point the probability is min(deaths[cluster], lambda) /
 * deaths[cluster], or 1 when deaths[cluster] is not positive.
 *
 * @tparam value_idx index type
 * @tparam value_t distance type
 * @param[in] min_mr_indices selected neighbor per prediction point; a negative
 *            entry (min_mutual_reachability_kernel writes -1 when it selects no
 *            candidate) is treated as "no neighbor" and yields label -1
 * @param[in] prediction_lambdas lambda value per prediction point
 * @param[in] index_into_children lookup from a cluster id to a position in lambdas
 * @param[in] labels labels indexed by the values stored in min_mr_indices
 * @param[in] deaths indexed by (selected cluster id - n_leaves)
 * @param[in] selected_clusters cluster id for each non-negative label
 * @param[in] parents not read by this kernel; kept for interface compatibility
 * @param[in] lambdas lambda values indexed through index_into_children
 * @param[in] n_leaves offset subtracted from cluster ids when indexing deaths
 * @param[in] n_prediction_points number of prediction points
 * @param[out] predicted_labels label per prediction point (-1 when unassigned)
 * @param[out] cluster_probabilities probability per prediction point
 */
template <typename value_idx, typename value_t>
__global__ void cluster_probability_kernel(value_idx* min_mr_indices,
                                           value_t* prediction_lambdas,
                                           value_idx* index_into_children,
                                           value_idx* labels,
                                           value_t* deaths,
                                           value_idx* selected_clusters,
                                           value_idx* parents,
                                           value_t* lambdas,
                                           value_idx n_leaves,
                                           size_t n_prediction_points,
                                           value_idx* predicted_labels,
                                           value_t* cluster_probabilities)
{
  // Form the global thread id in size_t so that neither the product
  // blockDim.x * blockIdx.x nor the comparison against n_prediction_points
  // is truncated to 32 bits.
  const size_t point = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
  if (point >= n_prediction_points) { return; }

  const value_t point_lambda = prediction_lambdas[point];
  const value_idx neighbor   = min_mr_indices[point];

  value_idx assigned_label = -1;
  value_idx cluster_id     = -1;

  // Guard against the -1 sentinel: indexing labels with it would read out of
  // bounds. Such a point is treated as noise.
  if (neighbor >= 0) {
    const value_idx neighbor_label = labels[neighbor];
    if (neighbor_label >= 0) {
      cluster_id = selected_clusters[neighbor_label];
      if (cluster_id == n_leaves) {
        assigned_label = neighbor_label;
      } else if (cluster_id > n_leaves &&
                 lambdas[index_into_children[cluster_id]] < point_lambda) {
        assigned_label = neighbor_label;
      }
    }
  }

  predicted_labels[point] = assigned_label;

  if (assigned_label < 0) {
    cluster_probabilities[point] = 0.0;
    return;
  }

  // Clamp the point's lambda to the cluster's death value before normalising;
  // a non-positive death value cannot be used as a divisor, so it maps to 1.
  const value_t max_lambda = deaths[cluster_id - n_leaves];
  if (max_lambda > 0) {
    const value_t clamped        = max_lambda < point_lambda ? max_lambda : point_lambda;
    cluster_probabilities[point] = clamped / max_lambda;
  } else {
    cluster_probabilities[point] = 1.0;
  }
}

}; // namespace Predict
}; // namespace detail
}; // namespace HDBSCAN
}; // namespace ML
Loading

0 comments on commit 58b63d1

Please sign in to comment.