Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Replace UMAP functionality dependency on nvgraph with RAFT Spectral Clustering #2500

Merged
merged 12 commits into from
Jul 17, 2020
Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- PR #2394: Adding cosine & correlation distance for KNN
- PR #2392: PCA can accept sparse inputs, and sparse prim for computing covariance
- PR #2465: Support pandas 1.0+
- PR #2500: Replace UMAP functionality dependency on nvgraph with RAFT Spectral Clustering
- PR #2520: TfidfVectorizer estimator

## Improvements
Expand Down
4 changes: 2 additions & 2 deletions cpp/cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@ else(DEFINED ENV{RAFT_PATH})

ExternalProject_Add(raft
GIT_REPOSITORY https://github.com/rapidsai/raft.git
GIT_TAG b58f97f2b5382a633e43daec31b26adf52e19a3b
GIT_TAG 6391b63cd32f04ac4912edecf62ff2865c529cd9
PREFIX ${RAFT_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND "")

# Redefining RAFT_DIR so it coincides with the one inferred by env variable.
set(RAFT_DIR ${RAFT_DIR}/src/raft/ CACHE STRING "Path to RAFT repo")
set(RAFT_DIR ${RAFT_DIR}/src/raft/)
aschaffer marked this conversation as resolved.
Show resolved Hide resolved
endif(DEFINED ENV{RAFT_PATH})


Expand Down
89 changes: 53 additions & 36 deletions cpp/src_prims/sparse/spectral.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
#include "coo.cuh"
#include "cusparse_wrappers.h"

#include <raft/spectral/partition.hpp>

namespace MLCommon {
namespace Spectral {

Expand Down Expand Up @@ -147,53 +149,68 @@ void fit_embedding(cusparseHandle_t handle, int *rows, int *cols, T *vals,
int nnz, int n, int n_components, T *out,
std::shared_ptr<deviceAllocator> d_alloc,
cudaStream_t stream) {
nvgraphHandle_t grapHandle;
cudaDataType_t edge_dimT = CUDA_R_32F;
NVGRAPH_CHECK(nvgraphCreate(&grapHandle));

device_buffer<int> src_offsets(d_alloc, stream, n + 1);
device_buffer<int> dst_cols(d_alloc, stream, nnz);
device_buffer<T> dst_vals(d_alloc, stream, nnz);
coo2csr(handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(),
dst_vals.data(), d_alloc, stream);

nvgraphCSRTopology32I_st CSR_input;
CSR_input.destination_indices = dst_cols.data();
CSR_input.nedges = nnz;
CSR_input.nvertices = n;
CSR_input.source_offsets = src_offsets.data();

int weight_index = 0;

device_buffer<T> eigVals(d_alloc, stream, n_components + 1);
device_buffer<T> eigVecs(d_alloc, stream, n * (n_components + 1));
device_buffer<int> labels(d_alloc, stream, n);

CUDA_CHECK(cudaStreamSynchronize(stream));

// Spectral clustering parameters
struct SpectralClusteringParameter clustering_params;
clustering_params.n_clusters = n_components + 1;
clustering_params.n_eig_vects = n_components + 1;
clustering_params.algorithm = NVGRAPH_BALANCED_CUT_LANCZOS;
clustering_params.evs_tolerance = 0.01f;
clustering_params.evs_max_iter = 0;
clustering_params.kmean_tolerance = 0.0f;
clustering_params.kmean_max_iter = 1;

nvgraphGraphDescr_t graph;
NVGRAPH_CHECK(nvgraphCreateGraphDescr(grapHandle, &graph));
NVGRAPH_CHECK(nvgraphSetGraphStructure(grapHandle, graph, (void *)&CSR_input,
NVGRAPH_CSR_32));
NVGRAPH_CHECK(nvgraphAllocateEdgeData(grapHandle, graph, 1, &edge_dimT));
NVGRAPH_CHECK(nvgraphSetEdgeData(grapHandle, graph, (void *)vals, 0));

NVGRAPH_CHECK(nvgraphSpectralClustering(grapHandle, graph, weight_index,
&clustering_params, labels.data(),
eigVals.data(), eigVecs.data()));

NVGRAPH_CHECK(nvgraphDestroyGraphDescr(grapHandle, graph));
NVGRAPH_CHECK(nvgraphDestroy(grapHandle));
//raft spectral clustering:
//
using index_type = int;
using value_type = T;

raft::handle_t r_handle;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@divyegala JFYI. Assuming this PR is merged before yours, we'll have to update this logic to use the raft handle that comes inside our cumlHandle.

r_handle.set_stream(stream);

//TODO: r_handle to be passed as argument;
aschaffer marked this conversation as resolved.
Show resolved Hide resolved
//this will be fixed in a separate refactoring PR;

index_type *ro = src_offsets.data();
index_type *ci = dst_cols.data();
value_type *vs = dst_vals.data();

raft::matrix::sparse_matrix_t<index_type, value_type> const r_csr_m{
r_handle, ro, ci, vs, n, nnz};

index_type neigvs = n_components + 1;
index_type maxiter = 4000; //default reset value (when set to 0);
value_type tol = 0.01;
index_type restart_iter = 15 + neigvs; //what cugraph is using
auto t_exe_p = thrust::cuda::par.on(stream);
using thrust_exe_policy_t = decltype(t_exe_p);

raft::eigen_solver_config_t<index_type, value_type> cfg{neigvs, maxiter,
restart_iter, tol};

raft::lanczos_solver_t<index_type, value_type> eig_solver{cfg};

//the cluster computation is irrelevant here,
//hence define a no-op cluster solver to
//feed to partition():
//
struct no_op_cluster_solver_t {
using index_type_t = index_type;
using size_type_t = index_type;
using value_type_t = value_type;

std::pair<value_type_t, index_type_t> solve(
handle_t const &handle, thrust_exe_policy_t t_exe_policy,
size_type_t n_obs_vecs, size_type_t dim,
value_type_t const *__restrict__ obs,
index_type_t *__restrict__ codes) const {
return std::make_pair<value_type_t, index_type_t>(0, 0);
}
};

raft::spectral::partition(r_handle, t_exe_p, r_csr_m, eig_solver,
no_op_cluster_solver_t{}, labels.data(),
eigVals.data(), eigVecs.data());

MLCommon::copy<T>(out, eigVecs.data() + n, n * n_components, stream);

Expand Down