diff --git a/cpp/src/dbscan/dbscan.cuh b/cpp/src/dbscan/dbscan.cuh
index 8664fb0a65..24595ff931 100644
--- a/cpp/src/dbscan/dbscan.cuh
+++ b/cpp/src/dbscan/dbscan.cuh
@@ -180,7 +180,8 @@ void dbscanFitImpl(const raft::handle_t& handle,
                      algo_ccl,
                      NULL,
                      batch_size,
-                     stream);
+                     stream,
+                     metric);
 
   CUML_LOG_DEBUG("Workspace size: %lf MB", (double)workspaceSize * 1e-6);
 
@@ -200,7 +201,8 @@ void dbscanFitImpl(const raft::handle_t& handle,
                      algo_ccl,
                      workspace.data(),
                      batch_size,
-                     stream);
+                     stream,
+                     metric);
 }
 
 }  // namespace Dbscan
diff --git a/cpp/src/dbscan/runner.cuh b/cpp/src/dbscan/runner.cuh
index e3e8dcd8aa..acef6c9785 100644
--- a/cpp/src/dbscan/runner.cuh
+++ b/cpp/src/dbscan/runner.cuh
@@ -114,7 +114,8 @@ std::size_t run(const raft::handle_t& handle,
                 int algo_ccl,
                 void* workspace,
                 std::size_t batch_size,
-                cudaStream_t stream)
+                cudaStream_t stream,
+                raft::distance::DistanceType metric)
 {
   const std::size_t align = 256;
   Index_ n_batches = raft::ceildiv((std::size_t)n_owned_rows, batch_size);
@@ -191,7 +192,7 @@ std::size_t run(const raft::handle_t& handle,
     CUML_LOG_DEBUG("--> Computing vertex degrees");
     raft::common::nvtx::push_range("Trace::Dbscan::VertexDeg");
     VertexDeg::run(
-      handle, adj, vd, x, eps, N, D, algo_vd, start_vertex_id, n_points, stream);
+      handle, adj, vd, x, eps, N, D, algo_vd, start_vertex_id, n_points, stream, metric);
     raft::common::nvtx::pop_range();
 
     CUML_LOG_DEBUG("--> Computing core point mask");
@@ -219,7 +220,7 @@ std::size_t run(const raft::handle_t& handle,
       CUML_LOG_DEBUG("--> Computing vertex degrees");
       raft::common::nvtx::push_range("Trace::Dbscan::VertexDeg");
       VertexDeg::run(
-        handle, adj, vd, x, eps, N, D, algo_vd, start_vertex_id, n_points, stream);
+        handle, adj, vd, x, eps, N, D, algo_vd, start_vertex_id, n_points, stream, metric);
       raft::common::nvtx::pop_range();
     }
     raft::update_host(&curradjlen, vd + n_points, 1, stream);
diff --git a/cpp/src/dbscan/vertexdeg/algo.cuh b/cpp/src/dbscan/vertexdeg/algo.cuh
index b4458e9008..8cb13e8c4d 100644
--- a/cpp/src/dbscan/vertexdeg/algo.cuh
+++ b/cpp/src/dbscan/vertexdeg/algo.cuh
@@ -18,7 +18,10 @@
 #include 
 #include 
+#include 
+#include 
 #include 
+#include 
 
 #include "pack.h"
@@ -35,19 +38,70 @@ void launcher(const raft::handle_t& handle,
               Pack<value_t, index_t> data,
               index_t start_vertex_id,
               index_t batch_size,
-              cudaStream_t stream)
+              cudaStream_t stream,
+              raft::distance::DistanceType metric)
 {
   data.resetArray(stream, batch_size + 1);
 
   ASSERT(sizeof(index_t) == 4 || sizeof(index_t) == 8, "index_t should be 4 or 8 bytes");
 
-  index_t m    = data.N;
-  index_t n    = min(data.N - start_vertex_id, batch_size);
-  index_t k    = data.D;
-  value_t eps2 = data.eps * data.eps;
+  index_t m = data.N;
+  index_t n = min(data.N - start_vertex_id, batch_size);
+  index_t k = data.D;
+  value_t eps2;
 
-  raft::spatial::knn::epsUnexpL2SqNeighborhood(
-    data.adj, data.vd, data.x, data.x + start_vertex_id * k, m, n, k, eps2, stream);
+  if (metric == raft::distance::DistanceType::CosineExpanded) {
+    rmm::device_uvector<value_t> rowNorms(m, stream);
+
+    raft::linalg::rowNorm(rowNorms.data(),
+                          data.x,
+                          k,
+                          m,
+                          raft::linalg::NormType::L2Norm,
+                          true,
+                          stream,
+                          [] __device__(value_t in) { return sqrtf(in); });
+
+    /* Cast away constness because the output matrix for normalization cannot be of const type.
+     * Input matrix will be modified due to normalization.
+     */
+    raft::linalg::matrixVectorOp(
+      const_cast<value_t*>(data.x),
+      data.x,
+      rowNorms.data(),
+      k,
+      m,
+      true,
+      true,
+      [] __device__(value_t mat_in, value_t vec_in) { return mat_in / vec_in; },
+      stream);
+
+    eps2 = 2 * data.eps;
+
+    raft::spatial::knn::epsUnexpL2SqNeighborhood(
+      data.adj, data.vd, data.x, data.x + start_vertex_id * k, m, n, k, eps2, stream);
+
+    /**
+     * Restoring the input matrix after normalization.
+     */
+    raft::linalg::matrixVectorOp(
+      const_cast<value_t*>(data.x),
+      data.x,
+      rowNorms.data(),
+      k,
+      m,
+      true,
+      true,
+      [] __device__(value_t mat_in, value_t vec_in) { return mat_in * vec_in; },
+      stream);
+  }
+
+  else {
+    eps2 = data.eps * data.eps;
+
+    raft::spatial::knn::epsUnexpL2SqNeighborhood(
+      data.adj, data.vd, data.x, data.x + start_vertex_id * k, m, n, k, eps2, stream);
+  }
 }
 
 }  // namespace Algo
diff --git a/cpp/src/dbscan/vertexdeg/runner.cuh b/cpp/src/dbscan/vertexdeg/runner.cuh
index 082a2ac46f..561c98ab12 100644
--- a/cpp/src/dbscan/vertexdeg/runner.cuh
+++ b/cpp/src/dbscan/vertexdeg/runner.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,13 +36,14 @@ void run(const raft::handle_t& handle,
          int algo,
          Index_ start_vertex_id,
          Index_ batch_size,
-         cudaStream_t stream)
+         cudaStream_t stream,
+         raft::distance::DistanceType metric)
 {
   Pack data = {vd, adj, x, eps, N, D};
   switch (algo) {
     case 0: Naive::launcher(data, start_vertex_id, batch_size, stream); break;
     case 1:
-      Algo::launcher(handle, data, start_vertex_id, batch_size, stream);
+      Algo::launcher(handle, data, start_vertex_id, batch_size, stream, metric);
       break;
     case 2:
       Precomputed::launcher(handle, data, start_vertex_id, batch_size, stream);
diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx
index d00df0a822..255ed01bf5 100644
--- a/python/cuml/cluster/dbscan.pyx
+++ b/python/cuml/cluster/dbscan.pyx
@@ -147,10 +147,13 @@ class DBSCAN(Base,
     min_samples : int (default = 5)
         The number of samples in a neighborhood such that this group can be
         considered as an important core point (including the point itself).
-    metric: {'euclidean', 'precomputed'}, default = 'euclidean'
+    metric: {'euclidean', 'cosine', 'precomputed'}, default = 'euclidean'
         The metric to use when calculating distances between points. If
         metric is 'precomputed', X is assumed to be a distance matrix and
        must be square.
+        The input will be modified temporarily when cosine distance is used
+        and the restored input matrix might not match completely
+        due to numerical rounding.
     verbose : int or boolean, default=False
         Sets logging level. It must be one of `cuml.common.logger.level_*`.
         See :ref:`verbosity-levels` for more info.
@@ -266,7 +269,8 @@ class DBSCAN(Base,
             metric_parsing = {
                 "L2": DistanceType.L2SqrtUnexpanded,
                 "euclidean": DistanceType.L2SqrtUnexpanded,
-                "precomputed": DistanceType.Precomputed,
+                "cosine": DistanceType.CosineExpanded,
+                "precomputed": DistanceType.Precomputed
             }
             if self.metric in metric_parsing:
                 metric = metric_parsing[self.metric.lower()]
diff --git a/python/cuml/tests/test_dbscan.py b/python/cuml/tests/test_dbscan.py
index 8c8027d7ec..e2cb8f27b2 100644
--- a/python/cuml/tests/test_dbscan.py
+++ b/python/cuml/tests/test_dbscan.py
@@ -116,6 +116,38 @@ def test_dbscan_precomputed(datatype, nrows, max_mbytes_per_batch, out_dtype):
                         cuml_dbscan.core_sample_indices_, eps)
 
 
+@pytest.mark.parametrize('max_mbytes_per_batch', [unit_param(1),
+                         quality_param(1e2), stress_param(None)])
+@pytest.mark.parametrize('nrows', [unit_param(500), quality_param(5000),
+                         stress_param(10000)])
+@pytest.mark.parametrize('out_dtype', ["int32", "int64"])
+def test_dbscan_cosine(nrows, max_mbytes_per_batch, out_dtype):
+    # 2-dimensional dataset for easy distance matrix computation
+    X, y = make_blobs(n_samples=nrows, cluster_std=0.01,
+                      n_features=2, random_state=0)
+
+    eps = 0.1
+
+    cuml_dbscan = cuDBSCAN(eps=eps, min_samples=5, metric='cosine',
+                           max_mbytes_per_batch=max_mbytes_per_batch,
+                           output_type='numpy')
+
+    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)
+
+    sk_dbscan = skDBSCAN(eps=eps, min_samples=5, metric='cosine',
+                         algorithm='brute')
+
+    sk_labels = sk_dbscan.fit_predict(X)
+
+    # Check the core points are equal
+    assert array_equal(cuml_dbscan.core_sample_indices_,
+                       sk_dbscan.core_sample_indices_)
+
+    # Check the labels are correct
+    assert_dbscan_equal(sk_labels, cu_labels, X,
+                        cuml_dbscan.core_sample_indices_, eps)
+
+
 @pytest.mark.parametrize("name", [
     'noisy_moons',
     'blobs',
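
Note (not part of the patch): the cosine branch added in vertexdeg/algo.cuh does not use a dedicated cosine kernel. It L2-normalizes the rows with rowNorm/matrixVectorOp and then reuses the existing epsUnexpL2SqNeighborhood primitive with a squared-Euclidean threshold of 2 * eps, which works because for unit-norm vectors the squared Euclidean distance equals twice the cosine distance. A minimal NumPy sketch of that identity (all names and values below are illustrative only):

    import numpy as np

    rng = np.random.default_rng(0)
    a, b = rng.standard_normal((2, 16))
    a /= np.linalg.norm(a)          # L2-normalize, as the rowNorm/matrixVectorOp pair does
    b /= np.linalg.norm(b)

    cosine_dist = 1.0 - a.dot(b)    # cosine distance between unit vectors
    sq_l2 = np.sum((a - b) ** 2)    # squared Euclidean distance, compared against eps2

    # ||a - b||^2 = 2 * (1 - cos(a, b)), so sq_l2 <= 2 * eps  <=>  cosine_dist <= eps
    assert np.isclose(sq_l2, 2.0 * cosine_dist)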