Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEA] All points membership vector for HDBSCAN #4800

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
8614cbd
Exemplar indices obtained
tarang-jain Jul 1, 2022
dbb7d48
Further additions to distance membership
tarang-jain Jul 5, 2022
8e25363
Cleanup dist membership vector
tarang-jain Jul 6, 2022
a6cb20f
Include changes
tarang-jain Jul 6, 2022
d04dff2
testing
tarang-jain Jul 6, 2022
2cfdae6
Further changes
tarang-jain Jul 6, 2022
f13e587
Further changes to distance based membership (clean build)
tarang-jain Jul 7, 2022
8421082
Reuse label map, replace unique_by_key with sorted_coo_to_csr
tarang-jain Jul 7, 2022
1ab28c0
Outlier based membership initial commit (unclean build)
tarang-jain Jul 11, 2022
66c815a
Restructuring functions (unclean build)
tarang-jain Jul 12, 2022
abff747
Intermediate commits
tarang-jain Jul 12, 2022
b979a5b
Corrections in exemplar computation and outlier membership
tarang-jain Jul 15, 2022
7bf26d7
All point membership vector all parts working
tarang-jain Jul 19, 2022
a7ab49e
Initial commit for Prediction Data class
tarang-jain Jul 20, 2022
bffd7ca
initial commit
tarang-jain Jul 25, 2022
ed59da5
Staged changes
tarang-jain Jul 25, 2022
3ed7ba3
Circling back to implementation without PredictionData
tarang-jain Jul 25, 2022
199695f
PredictionData finally added (errors in cython)
tarang-jain Jul 26, 2022
17cf426
Clean build with some debug statements
tarang-jain Jul 26, 2022
e5bc693
Python API created and working
tarang-jain Jul 26, 2022
4f4136b
convert output dtype to cupy
tarang-jain Jul 26, 2022
b130ebe
Debugging exemplar_idx with large number of clusters
tarang-jain Jul 27, 2022
51cc6c8
correction cache function and moving to .cu file
tarang-jain Jul 28, 2022
336d638
Allow size_t as size of dataset (note: beware of limits as laterconve…
tarang-jain Jul 28, 2022
cca5f15
Save data (self.X_m), resolving compute-sanitizer errors
tarang-jain Jul 29, 2022
d5264f9
Resolving compute-sanitizer error
tarang-jain Jul 29, 2022
441a397
Added gtest
tarang-jain Aug 1, 2022
2c00591
Further changes to pytest
tarang-jain Aug 1, 2022
a5cdf70
Add pytest, add support for distance metrics
tarang-jain Aug 2, 2022
b971aa5
Styling changes
tarang-jain Aug 2, 2022
26f7ab0
Resolved build failure
tarang-jain Aug 2, 2022
f46b3f4
Styling and copyright changes
tarang-jain Aug 2, 2022
2364455
Added prediction_data to get_param_names
tarang-jain Aug 2, 2022
291aa8d
Remove debug and sync statements from runner.h
tarang-jain Aug 9, 2022
cfeb6d3
Merge branch 'branch-22.10' of github.com:rapidsai/cuml into fea-all-…
tarang-jain Aug 10, 2022
e9c6ca4
Updated docs for CI failure
tarang-jain Aug 10, 2022
dfca0e4
Docs changes for failing CI
tarang-jain Aug 11, 2022
daed99b
Rename namespace to Predict
tarang-jain Aug 11, 2022
0d311ea
some changes after PR review
tarang-jain Aug 11, 2022
0a577e5
debugging failing gtest
tarang-jain Aug 11, 2022
589194a
Updates after PR changes (failing GTest handled)
tarang-jain Aug 12, 2022
ca81e87
Merge branch 'branch-22.10' of github.com:rapidsai/cuml into fea-all-…
tarang-jain Aug 12, 2022
43de057
Merge branch 'rapidsai:branch-22.10' into fea-all-points-membership-v…
tarang-jain Aug 12, 2022
dc8cbb4
Adding new separate hdbscan() function for prediction_data after PR …
tarang-jain Aug 15, 2022
db324f4
Merge branch 'fea-all-points-membership-vector-hdbscan' of github.com…
tarang-jain Aug 15, 2022
4f49a7d
Updates to docs
tarang-jain Aug 16, 2022
1768625
Update python docs
tarang-jain Aug 16, 2022
7e8b8e0
include count header for failing CI
tarang-jain Aug 16, 2022
48b84c5
Fixing bug related to unconverted labels
tarang-jain Aug 19, 2022
871d490
Fixing copyright issues
tarang-jain Aug 23, 2022
00b39aa
Merge branch 'branch-22.10' of github.com:rapidsai/cuml into fea-all-…
tarang-jain Aug 25, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,8 @@ if(BUILD_CUML_CPP_LIBRARY)
src/genetic/program.cu
src/genetic/node.cu
src/hdbscan/hdbscan.cu
src/hdbscan/condensed_hierarchy.cu)
src/hdbscan/condensed_hierarchy.cu
src/hdbscan/prediction_data.cu)
endif()

if(all_algo OR holtwinters_algo)
Expand Down
103 changes: 103 additions & 0 deletions cpp/include/cuml/cluster/hdbscan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,79 @@ class hdbscan_output : public robust_single_linkage_output<value_idx, value_t> {

template class CondensedHierarchy<int, float>;

/**
 * Container object for computing and storing intermediate information needed later for computing
 * membership vectors and approximate_predict.
 *
 * The object keeps a reference to the raft handle it was constructed with, so that handle must
 * outlive the PredictionData instance. All device buffers start empty (size 0) and are sized
 * later through allocate() and set_n_clusters().
 *
 * @tparam value_idx integral type used for indices and counts
 * @tparam value_t   floating-point type used for the stored cluster deaths
 */
template <typename value_idx, typename value_t>
class PredictionData {
 public:
  /**
   * Creates an empty container.
   *
   * The mem-initializer list is written in member declaration order (n_rows and n_cols are
   * declared first), which is the order in which C++ initializes members regardless of how the
   * list is spelled.
   *
   * @param handle_ raft handle; its stream is used to construct the (empty) device buffers
   * @param m value stored in n_rows
   * @param n value stored in n_cols
   */
  PredictionData(const raft::handle_t& handle_, value_idx m, value_idx n)
    : n_rows(static_cast<size_t>(m)),
      n_cols(static_cast<size_t>(n)),
      handle(handle_),
      exemplar_idx(0, handle.get_stream()),
      exemplar_label_offsets(0, handle.get_stream()),
      n_exemplars(0),
      n_selected_clusters(0),
      selected_clusters(0, handle.get_stream()),
      deaths(0, handle.get_stream())
  {
  }

  // Dimensions supplied at construction. These two members are public; everything else is
  // private and reachable only through the getters below.
  size_t n_rows;
  size_t n_cols;

  // Getters for the private state. The scalar getters are const-qualified so they can be called
  // on a const PredictionData. The pointer getters stay non-const because they hand out mutable
  // pointers into the underlying buffers.
  value_idx get_n_exemplars() const { return n_exemplars; }
  value_idx get_n_selected_clusters() const { return n_selected_clusters; }
  value_idx* get_exemplar_idx() { return exemplar_idx.data(); }
  value_idx* get_exemplar_label_offsets() { return exemplar_label_offsets.data(); }
  value_idx* get_selected_clusters() { return selected_clusters.data(); }
  value_t* get_deaths() { return deaths.data(); }

  /**
   * Resize buffers to the required sizes for storing data
   * (declared here; the definition is not part of this header)
   * @param handle raft handle for ordering cuda operations
   * @param n_exemplars_ number of exemplar points
   * @param n_selected_clusters_ number of clusters selected
   */
  void allocate(const raft::handle_t& handle,
                value_idx n_exemplars_,
                value_idx n_selected_clusters_);

  /**
   * Resize buffers for cluster deaths to n_clusters
   * @param handle raft handle for ordering cuda operations (the resize is issued on its stream)
   * @param n_clusters_ new number of elements of the deaths buffer
   */
  void set_n_clusters(const raft::handle_t& handle, value_idx n_clusters_)
  {
    deaths.resize(n_clusters_, handle.get_stream());
  }

 private:
  // Non-owning reference to the handle passed to the constructor.
  const raft::handle_t& handle;
  rmm::device_uvector<value_idx> exemplar_idx;
  rmm::device_uvector<value_idx> exemplar_label_offsets;
  value_idx n_exemplars;
  value_idx n_selected_clusters;
  rmm::device_uvector<value_idx> selected_clusters;
  rmm::device_uvector<value_t> deaths;
};

// Explicit instantiation of PredictionData for int indices and float values.
template class PredictionData<int, float>;

/**
 * Declaration only; the definition is not part of this header excerpt (presumably it lives in
 * src/hdbscan/prediction_data.cu, which the same change adds to the build - TODO confirm).
 *
 * @param handle raft handle
 * @param condensed_tree condensed hierarchy with int indices and float values
 * @param labels int array of labels (presumably one per input row, on device - TODO confirm)
 * @param label_map int array (NOTE(review): meaning and length are not visible here - confirm
 *                  against the definition)
 * @param n_selected_clusters number of selected clusters
 * @param prediction_data PredictionData object passed by non-const reference (presumably
 *                        populated by this call - TODO confirm)
 */
void build_prediction_data(const raft::handle_t& handle,
CondensedHierarchy<int, float>& condensed_tree,
int* labels,
int* label_map,
int n_selected_clusters,
PredictionData<int, float>& prediction_data);

}; // namespace Common
}; // namespace HDBSCAN

Expand Down Expand Up @@ -338,6 +411,29 @@ void hdbscan(const raft::handle_t& handle,
HDBSCAN::Common::HDBSCANParams& params,
HDBSCAN::Common::hdbscan_output<int, float>& out);

/**
 * Executes HDBSCAN clustering on an mxn-dimensional input array, X, and builds the PredictionData
 * object, which computes and stores information needed later by the prediction algorithms.
 *
 * This is an overload of hdbscan() that takes one additional trailing argument,
 * prediction_data_.
 *
 * @param[in] handle raft handle for resource reuse
 * @param[in] X array (size m, n) on device in row-major format
 * @param[in] m number of rows in X
 * @param[in] n number of columns in X
 * @param[in] metric distance metric to use
 * @param[in] params struct of configuration hyper-parameters
 * @param[out] out struct of output data and arrays on device
 * @param[out] prediction_data_ struct for computing and storing information to be used during
 * prediction
 */
void hdbscan(const raft::handle_t& handle,
const float* X,
size_t m,
size_t n,
raft::distance::DistanceType metric,
HDBSCAN::Common::HDBSCANParams& params,
HDBSCAN::Common::hdbscan_output<int, float>& out,
HDBSCAN::Common::PredictionData<int, float>& prediction_data_);

void build_condensed_hierarchy(const raft::handle_t& handle,
const int* children,
const float* delta,
Expand All @@ -359,4 +455,11 @@ void _extract_clusters(const raft::handle_t& handle,
bool allow_single_cluster,
int max_cluster_size,
float cluster_selection_epsilon);

/**
 * Declaration of the all-points membership vector routine; the definition is not part of this
 * header excerpt.
 *
 * @param handle raft handle
 * @param condensed_tree condensed hierarchy with int indices and float values
 * @param prediction_data PredictionData object (presumably one already filled in by the
 *                        hdbscan() overload that accepts prediction_data_ - TODO confirm)
 * @param membership_vec float buffer (presumably the output, holding one value per
 *                       (row, selected cluster) pair on device - TODO confirm size and layout)
 * @param X float input array (presumably the same (m, n) row-major device array that was
 *          clustered - TODO confirm)
 * @param metric distance metric
 */
void _all_points_membership_vectors(const raft::handle_t& handle,
HDBSCAN::Common::CondensedHierarchy<int, float>& condensed_tree,
HDBSCAN::Common::PredictionData<int, float>& prediction_data,
float* membership_vec,
const float* X,
raft::distance::DistanceType metric);
} // END namespace ML
90 changes: 90 additions & 0 deletions cpp/src/hdbscan/detail/kernels/soft_clustering.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

namespace ML {
namespace HDBSCAN {
namespace detail {
namespace Predict {

/**
 * Writes one entry of `heights` per (row, selected cluster) pair.
 *
 * Each thread handles one flat index `idx`, decoded as `row = idx / n_selected_clusters` and
 * `col = idx % n_selected_clusters`. Starting from the parent of `row` on one side and
 * `selected_clusters[col]` on the other, the thread repeatedly replaces the larger of the two
 * cluster ids by its parent until both ids are equal. If both sides had to move, the lambda of
 * the last cluster that was replaced is stored; otherwise the lambda of `row` itself is stored.
 *
 * The flat index and its bound are computed in size_t, so the guard stays correct when
 * `m * n_selected_clusters` does not fit in `value_idx`.
 *
 * NOTE(review): termination of the walk relies on a parent always having a smaller id than its
 * child and on both walks reaching a common ancestor - confirm against the condensed tree
 * construction.
 *
 * @tparam value_idx integral index type
 * @tparam value_t   value type of the lambdas / heights
 * @tparam tpb       unused inside the kernel body (presumably the launch block size - TODO confirm)
 * @param[out] heights output array, written at the flat index `row * n_selected_clusters + col`
 * @param[in] lambdas lambda values, read at positions given by `index_into_children`
 * @param[in] index_into_children lookup from a point / cluster id to a position in `parents`
 *                                and `lambdas`
 * @param[in] parents parent cluster ids, read at positions given by `index_into_children`
 * @param[in] m number of rows
 * @param[in] n_selected_clusters number of entries of `selected_clusters` (columns of `heights`)
 * @param[in] selected_clusters ids of the selected clusters
 */
template <typename value_idx, typename value_t, int tpb = 256>
__global__ void merge_height_kernel(value_t* heights,
                                    value_t* lambdas,
                                    value_idx* index_into_children,
                                    value_idx* parents,
                                    size_t m,
                                    value_idx n_selected_clusters,
                                    value_idx* selected_clusters)
{
  // Widen before multiplying: blockDim.x * blockIdx.x is otherwise evaluated in 32 bits.
  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;

  // A non-positive column count means there is nothing to write (and guards the division below).
  if (n_selected_clusters <= 0) { return; }

  const size_t n_cols     = static_cast<size_t>(n_selected_clusters);
  const size_t n_elements = m * n_cols;
  if (idx >= n_elements) { return; }

  const value_idx row = static_cast<value_idx>(idx / n_cols);
  const value_idx col = static_cast<value_idx>(idx % n_cols);

  value_idx right_cluster = selected_clusters[col];
  value_idx left_cluster  = parents[index_into_children[row]];
  bool took_right_parent  = false;
  bool took_left_parent   = false;
  // Only read when both flags are set, i.e. after the loop has assigned it at least twice;
  // initialized anyway so it never holds an indeterminate value.
  value_idx last_cluster = left_cluster;

  // Move the side with the larger id one level up until both sides meet.
  while (left_cluster != right_cluster) {
    if (left_cluster > right_cluster) {
      took_left_parent = true;
      last_cluster     = left_cluster;
      left_cluster     = parents[index_into_children[left_cluster]];
    } else {
      took_right_parent = true;
      last_cluster      = right_cluster;
      right_cluster     = parents[index_into_children[right_cluster]];
    }
  }

  if (took_left_parent && took_right_parent) {
    // Both sides climbed: use the lambda of the last cluster that was replaced.
    heights[idx] = lambdas[index_into_children[last_cluster]];
  } else {
    // At most one side climbed: use the lambda of the point itself.
    heights[idx] = lambdas[index_into_children[row]];
  }
}

/**
 * Computes one entry of `prob_in_some_cluster` per row.
 *
 * For row `idx`, `height_argmax[idx]` is converted to a column index `c`. The result is
 * `heights[idx * n_selected_clusters + c]` divided by the larger of the row's own lambda and
 * `deaths[selected_clusters[c] - n_leaves]`.
 *
 * The row index and the flat offset into `heights` are computed in size_t, so neither the bound
 * check against `m` nor `idx * n_selected_clusters` can overflow `value_idx`.
 *
 * NOTE(review): no guard is applied for a zero denominator; this matches the original code.
 *
 * @tparam value_idx integral index type
 * @tparam value_t   value type of the lambdas / heights
 * @param[in] heights array read at the flat index `idx * n_selected_clusters + c`
 * @param[in] height_argmax per-row column index, stored as value_t and truncated to value_idx
 * @param[in] deaths array read at `selected_clusters[c] - n_leaves`
 * @param[in] index_into_children lookup from a row id to a position in `lambdas`
 * @param[in] selected_clusters ids of the selected clusters
 * @param[in] lambdas lambda values, read at positions given by `index_into_children`
 * @param[out] prob_in_some_cluster output array, one value written per row
 * @param[in] n_selected_clusters row stride of `heights`
 * @param[in] n_leaves offset subtracted from a selected cluster id before indexing `deaths`
 * @param[in] m number of rows
 */
template <typename value_idx, typename value_t>
__global__ void prob_in_some_cluster_kernel(value_t* heights,
                                            value_t* height_argmax,
                                            value_t* deaths,
                                            value_idx* index_into_children,
                                            value_idx* selected_clusters,
                                            value_t* lambdas,
                                            value_t* prob_in_some_cluster,
                                            value_idx n_selected_clusters,
                                            value_idx n_leaves,
                                            size_t m)
{
  // Widen before multiplying: blockDim.x * blockIdx.x is otherwise evaluated in 32 bits.
  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
  if (idx >= m) { return; }

  // Column of this row's entry in `heights`; stored as a floating-point value by the caller.
  const value_idx best_col = static_cast<value_idx>(height_argmax[idx]);

  const value_t point_lambda  = lambdas[index_into_children[idx]];
  const value_t cluster_death = deaths[selected_clusters[best_col] - n_leaves];
  const value_t max_lambda    = max(point_lambda, cluster_death);

  // 64-bit flat offset: idx * n_selected_clusters may exceed the range of value_idx.
  const size_t flat = idx * static_cast<size_t>(n_selected_clusters) + best_col;

  prob_in_some_cluster[idx] = heights[flat] / max_lambda;
}

}; // namespace Predict
}; // namespace detail
}; // namespace HDBSCAN
}; // namespace ML
Loading