From 8566161f7de7ddaf78a8987ef71d8354fe8b903a Mon Sep 17 00:00:00 2001 From: aamijar Date: Sat, 1 Jun 2024 05:29:14 +0000 Subject: [PATCH 1/8] enable sparse dice metric in umap --- python/cuml/manifold/umap_utils.pyx | 5 ++++- python/cuml/tests/test_umap.py | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cuml/manifold/umap_utils.pyx b/python/cuml/manifold/umap_utils.pyx index 200b8cc4b3..ff41a38ebf 100644 --- a/python/cuml/manifold/umap_utils.pyx +++ b/python/cuml/manifold/umap_utils.pyx @@ -150,7 +150,8 @@ metric_parsing = { "hellinger": DistanceType.HellingerExpanded, "hamming": DistanceType.HammingUnexpanded, "jaccard": DistanceType.JaccardExpanded, - "canberra": DistanceType.Canberra + "canberra": DistanceType.Canberra, + "dice": DistanceType.DiceExpanded, } @@ -166,6 +167,7 @@ DENSE_SUPPORTED_METRICS = [ DistanceType.L2Unexpanded, DistanceType.Linf, DistanceType.LpUnexpanded, + DistanceType.DiceExpanded, ] @@ -181,4 +183,5 @@ SPARSE_SUPPORTED_METRICS = [ DistanceType.L2Unexpanded, DistanceType.Linf, DistanceType.LpUnexpanded, + DistanceType.DiceExpanded, ] diff --git a/python/cuml/tests/test_umap.py b/python/cuml/tests/test_umap.py index 6faa4ad8d3..e9c5a11a62 100644 --- a/python/cuml/tests/test_umap.py +++ b/python/cuml/tests/test_umap.py @@ -676,6 +676,7 @@ def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors): @pytest.mark.parametrize( "metric,supported", [ + ("dice", True), # not supported for dense yet in raft ("l2", True), ("euclidean", True), ("sqeuclidean", True), @@ -729,6 +730,7 @@ def test_umap_distance_metrics_fit_transform_trust(metric, supported): @pytest.mark.parametrize( "metric,supported,umap_learn_supported", [ + ("dice", True, True), ("l2", True, False), ("euclidean", True, True), ("sqeuclidean", True, False), From 08e2a170037fe2a37aec010a808e6798821f303e Mon Sep 17 00:00:00 2001 From: aamijar Date: Tue, 18 Jun 2024 23:38:46 +0000 Subject: [PATCH 2/8] dice metric for pairwise_distances api --- cpp/CMakeLists.txt | 1 + cpp/src/metrics/pairwise_distance.cu | 7 +++ cpp/src/metrics/pairwise_distance_dice.cu | 59 ++++++++++++++++++++++ cpp/src/metrics/pairwise_distance_dice.cuh | 46 +++++++++++++++++ python/cuml/metrics/pairwise_distances.pyx | 9 ++-- python/cuml/tests/test_umap.py | 4 +- 6 files changed, 120 insertions(+), 6 deletions(-) create mode 100644 cpp/src/metrics/pairwise_distance_dice.cu create mode 100644 cpp/src/metrics/pairwise_distance_dice.cuh diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4b9692beb2..70e13d21d3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -444,6 +444,7 @@ if(BUILD_CUML_CPP_LIBRARY) src/metrics/pairwise_distance_chebyshev.cu src/metrics/pairwise_distance_correlation.cu src/metrics/pairwise_distance_cosine.cu + src/metrics/pairwise_distance_dice.cu src/metrics/pairwise_distance_euclidean.cu src/metrics/pairwise_distance_hamming.cu src/metrics/pairwise_distance_hellinger.cu diff --git a/cpp/src/metrics/pairwise_distance.cu b/cpp/src/metrics/pairwise_distance.cu index 4cb9fb60d1..479e862d8b 100644 --- a/cpp/src/metrics/pairwise_distance.cu +++ b/cpp/src/metrics/pairwise_distance.cu @@ -19,6 +19,7 @@ #include "pairwise_distance_chebyshev.cuh" #include "pairwise_distance_correlation.cuh" #include "pairwise_distance_cosine.cuh" +#include "pairwise_distance_dice.cuh" #include "pairwise_distance_euclidean.cuh" #include "pairwise_distance_hamming.cuh" #include "pairwise_distance_hellinger.cuh" @@ -88,6 +89,9 @@ void pairwise_distance(const raft::handle_t& handle, case raft::distance::DistanceType::RusselRaoExpanded: pairwise_distance_russell_rao(handle, x, y, dist, m, n, k, isRowMajor, metric_arg); break; + case raft::distance::DistanceType::DiceExpanded: + pairwise_distance_dice(handle, x, y, dist, m, n, k, isRowMajor, metric_arg); + break; default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; } @@ -143,6 +147,9 @@ void pairwise_distance(const raft::handle_t& handle, case raft::distance::DistanceType::RusselRaoExpanded: pairwise_distance_russell_rao(handle, x, y, dist, m, n, k, isRowMajor, metric_arg); break; + case raft::distance::DistanceType::DiceExpanded: + pairwise_distance_dice(handle, x, y, dist, m, n, k, isRowMajor, metric_arg); + break; default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; } diff --git a/cpp/src/metrics/pairwise_distance_dice.cu b/cpp/src/metrics/pairwise_distance_dice.cu new file mode 100644 index 0000000000..bc90d26c8d --- /dev/null +++ b/cpp/src/metrics/pairwise_distance_dice.cu @@ -0,0 +1,59 @@ + +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pairwise_distance_dice.cuh" + +#include +#include + +#include + +namespace ML { + +namespace Metrics { +void pairwise_distance_dice(const raft::handle_t& handle, + const double* x, + const double* y, + double* dist, + int m, + int n, + int k, + bool isRowMajor, + double metric_arg) +{ + // Call the distance function + raft::distance::distance( + handle, x, y, dist, m, n, k, isRowMajor); +} + +void pairwise_distance_dice(const raft::handle_t& handle, + const float* x, + const float* y, + float* dist, + int m, + int n, + int k, + bool isRowMajor, + float metric_arg) +{ + // Call the distance function + raft::distance::distance( + handle, x, y, dist, m, n, k, isRowMajor); +} + +} // namespace Metrics +} // namespace ML diff --git a/cpp/src/metrics/pairwise_distance_dice.cuh b/cpp/src/metrics/pairwise_distance_dice.cuh new file mode 100644 index 0000000000..cd5b8a247f --- /dev/null +++ b/cpp/src/metrics/pairwise_distance_dice.cuh @@ -0,0 +1,46 @@ + +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace ML { + +namespace Metrics { +void pairwise_distance_dice(const raft::handle_t& handle, + const double* x, + const double* y, + double* dist, + int m, + int n, + int k, + bool isRowMajor, + double metric_arg); + +void pairwise_distance_dice(const raft::handle_t& handle, + const float* x, + const float* y, + float* dist, + int m, + int n, + int k, + bool isRowMajor, + float metric_arg); + +} // namespace Metrics +} // namespace ML diff --git a/python/cuml/metrics/pairwise_distances.pyx b/python/cuml/metrics/pairwise_distances.pyx index c7d9952a8c..d17cb415cc 100644 --- a/python/cuml/metrics/pairwise_distances.pyx +++ b/python/cuml/metrics/pairwise_distances.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -80,7 +80,8 @@ PAIRWISE_DISTANCE_METRICS = { "hamming": DistanceType.HammingUnexpanded, "kldivergence": DistanceType.KLDivergence, "russellrao": DistanceType.RusselRaoExpanded, - "nan_euclidean": DistanceType.L2Expanded + "nan_euclidean": DistanceType.L2Expanded, + "dice": DistanceType.DiceExpanded } PAIRWISE_DISTANCE_SPARSE_METRICS = { @@ -344,7 +345,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", handle=None, if metric in ['nan_euclidean']: return nan_euclidean_distances(X, Y, **kwds) - if metric in ['russellrao'] and not np.all(X.data == 1.): + if metric in ['russellrao', 'dice'] and not np.all(X.data == 1.): warnings.warn("X was converted to boolean for metric {}" .format(metric)) X = np.where(X != 0., 1.0, 0.0) @@ -367,7 +368,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", handle=None, if (n_samples_x == 1 or n_features_x == 1): input_order = "K" - if metric in ['russellrao'] and not np.all(Y.data == 1.): + if metric in ['russellrao', 'dice'] and not np.all(Y.data == 1.): warnings.warn("Y was converted to boolean for metric {}" .format(metric)) Y = np.where(Y != 0., 1.0, 0.0) diff --git a/python/cuml/tests/test_umap.py b/python/cuml/tests/test_umap.py index e9c5a11a62..26ab1cb78b 100644 --- a/python/cuml/tests/test_umap.py +++ b/python/cuml/tests/test_umap.py @@ -676,7 +676,6 @@ def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors): @pytest.mark.parametrize( "metric,supported", [ - ("dice", True), # not supported for dense yet in raft ("l2", True), ("euclidean", True), ("sqeuclidean", True), @@ -686,6 +685,7 @@ def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors): ("chebyshev", True), ("cosine", True), ("correlation", True), + ("dice", True), ("jaccard", False), ("hamming", True), ("canberra", True), @@ -730,7 +730,6 @@ def test_umap_distance_metrics_fit_transform_trust(metric, supported): @pytest.mark.parametrize( "metric,supported,umap_learn_supported", [ - ("dice", True, True), ("l2", True, False), ("euclidean", True, True), ("sqeuclidean", True, False), @@ -740,6 +739,7 @@ def test_umap_distance_metrics_fit_transform_trust(metric, supported): ("chebyshev", True, True), ("cosine", True, True), ("correlation", True, True), + ("dice", True, True), ("jaccard", True, True), ("hamming", True, True), ("canberra", True, True), From 99075ddae5ed0aa8008be8e3f57fd99f100f4d71 Mon Sep 17 00:00:00 2001 From: aamijar Date: Mon, 24 Jun 2024 22:31:40 +0000 Subject: [PATCH 3/8] remove comments --- cpp/src/metrics/pairwise_distance_dice.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/metrics/pairwise_distance_dice.cu b/cpp/src/metrics/pairwise_distance_dice.cu index bc90d26c8d..7aaabf07f3 100644 --- a/cpp/src/metrics/pairwise_distance_dice.cu +++ b/cpp/src/metrics/pairwise_distance_dice.cu @@ -35,7 +35,6 @@ void pairwise_distance_dice(const raft::handle_t& handle, bool isRowMajor, double metric_arg) { - // Call the distance function raft::distance::distance( handle, x, y, dist, m, n, k, isRowMajor); } @@ -50,7 +49,6 @@ void pairwise_distance_dice(const raft::handle_t& handle, bool isRowMajor, float metric_arg) { - // Call the distance function raft::distance::distance( handle, x, y, dist, m, n, k, isRowMajor); } From 6b046f756007cd8b53dede6d678be2825a70cab2 Mon Sep 17 00:00:00 2001 From: aamijar Date: Mon, 24 Jun 2024 22:47:00 +0000 Subject: [PATCH 4/8] test ci with raft fork --- cpp/cmake/thirdparty/get_raft.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake index 7bc860eed8..d2555255cb 100644 --- a/cpp/cmake/thirdparty/get_raft.cmake +++ b/cpp/cmake/thirdparty/get_raft.cmake @@ -82,8 +82,8 @@ endfunction() # To use a different RAFT locally, set the CMake variable # CPM_raft_SOURCE=/path/to/local/raft find_and_configure_raft(VERSION ${CUML_MIN_VERSION_raft} - FORK rapidsai - PINNED_TAG branch-${CUML_BRANCH_VERSION_raft} + FORK aamijar + PINNED_TAG dice-distance-dense-inputs EXCLUDE_FROM_ALL ${CUML_EXCLUDE_RAFT_FROM_ALL} # When PINNED_TAG above doesn't match cuml, # force local raft clone in build directory From 5a4861ae7d1824c3ed547a8bd0a2d4ce44f86f76 Mon Sep 17 00:00:00 2001 From: aamijar Date: Tue, 25 Jun 2024 16:47:29 +0000 Subject: [PATCH 5/8] remove test ci --- cpp/cmake/thirdparty/get_raft.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake index d2555255cb..7bc860eed8 100644 --- a/cpp/cmake/thirdparty/get_raft.cmake +++ b/cpp/cmake/thirdparty/get_raft.cmake @@ -82,8 +82,8 @@ endfunction() # To use a different RAFT locally, set the CMake variable # CPM_raft_SOURCE=/path/to/local/raft find_and_configure_raft(VERSION ${CUML_MIN_VERSION_raft} - FORK aamijar - PINNED_TAG dice-distance-dense-inputs + FORK rapidsai + PINNED_TAG branch-${CUML_BRANCH_VERSION_raft} EXCLUDE_FROM_ALL ${CUML_EXCLUDE_RAFT_FROM_ALL} # When PINNED_TAG above doesn't match cuml, # force local raft clone in build directory From 88278d34879d061ceb58f027c366942444d1c415 Mon Sep 17 00:00:00 2001 From: aamijar Date: Tue, 25 Jun 2024 21:23:16 +0000 Subject: [PATCH 6/8] refactor --- cpp/src/metrics/pairwise_distance_dice.cu | 3 +-- cpp/src/metrics/pairwise_distance_dice.cuh | 3 +-- python/cuml/manifold/umap_utils.pyx | 28 +++++++++++----------- python/cuml/metrics/pairwise_distances.pyx | 8 +++---- 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/cpp/src/metrics/pairwise_distance_dice.cu b/cpp/src/metrics/pairwise_distance_dice.cu index 7aaabf07f3..eb84dbece4 100644 --- a/cpp/src/metrics/pairwise_distance_dice.cu +++ b/cpp/src/metrics/pairwise_distance_dice.cu @@ -1,6 +1,5 @@ - /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/metrics/pairwise_distance_dice.cuh b/cpp/src/metrics/pairwise_distance_dice.cuh index cd5b8a247f..6e6351c02d 100644 --- a/cpp/src/metrics/pairwise_distance_dice.cuh +++ b/cpp/src/metrics/pairwise_distance_dice.cuh @@ -1,6 +1,5 @@ - /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/python/cuml/manifold/umap_utils.pyx b/python/cuml/manifold/umap_utils.pyx index ff41a38ebf..237829ea00 100644 --- a/python/cuml/manifold/umap_utils.pyx +++ b/python/cuml/manifold/umap_utils.pyx @@ -135,23 +135,23 @@ def find_ab_params(spread, min_dist): metric_parsing = { - "l2": DistanceType.L2SqrtUnexpanded, - "euclidean": DistanceType.L2SqrtUnexpanded, - "sqeuclidean": DistanceType.L2Unexpanded, - "cityblock": DistanceType.L1, - "l1": DistanceType.L1, - "manhattan": DistanceType.L1, - "taxicab": DistanceType.L1, - "minkowski": DistanceType.LpUnexpanded, + "canberra": DistanceType.Canberra, "chebyshev": DistanceType.Linf, - "linf": DistanceType.Linf, + "cityblock": DistanceType.L1, "cosine": DistanceType.CosineExpanded, "correlation": DistanceType.CorrelationExpanded, - "hellinger": DistanceType.HellingerExpanded, + "dice": DistanceType.DiceExpanded, + "euclidean": DistanceType.L2SqrtUnexpanded, "hamming": DistanceType.HammingUnexpanded, + "hellinger": DistanceType.HellingerExpanded, "jaccard": DistanceType.JaccardExpanded, - "canberra": DistanceType.Canberra, - "dice": DistanceType.DiceExpanded, + "l1": DistanceType.L1, + "l2": DistanceType.L2SqrtUnexpanded, + "linf": DistanceType.Linf, + "manhattan": DistanceType.L1, + "minkowski": DistanceType.LpUnexpanded, + "sqeuclidean": DistanceType.L2Unexpanded, + "taxicab": DistanceType.L1, } @@ -159,6 +159,7 @@ DENSE_SUPPORTED_METRICS = [ DistanceType.Canberra, DistanceType.CorrelationExpanded, DistanceType.CosineExpanded, + DistanceType.DiceExpanded, DistanceType.HammingUnexpanded, DistanceType.HellingerExpanded, # DistanceType.JaccardExpanded, # not supported @@ -167,7 +168,6 @@ DENSE_SUPPORTED_METRICS = [ DistanceType.L2Unexpanded, DistanceType.Linf, DistanceType.LpUnexpanded, - DistanceType.DiceExpanded, ] @@ -175,6 +175,7 @@ SPARSE_SUPPORTED_METRICS = [ DistanceType.Canberra, DistanceType.CorrelationExpanded, DistanceType.CosineExpanded, + DistanceType.DiceExpanded, DistanceType.HammingUnexpanded, DistanceType.HellingerExpanded, DistanceType.JaccardExpanded, @@ -183,5 +184,4 @@ SPARSE_SUPPORTED_METRICS = [ DistanceType.L2Unexpanded, DistanceType.Linf, DistanceType.LpUnexpanded, - DistanceType.DiceExpanded, ] diff --git a/python/cuml/metrics/pairwise_distances.pyx b/python/cuml/metrics/pairwise_distances.pyx index d17cb415cc..269b1dbdc8 100644 --- a/python/cuml/metrics/pairwise_distances.pyx +++ b/python/cuml/metrics/pairwise_distances.pyx @@ -81,7 +81,7 @@ PAIRWISE_DISTANCE_METRICS = { "kldivergence": DistanceType.KLDivergence, "russellrao": DistanceType.RusselRaoExpanded, "nan_euclidean": DistanceType.L2Expanded, - "dice": DistanceType.DiceExpanded + "dice": DistanceType.DiceExpanded, } PAIRWISE_DISTANCE_SPARSE_METRICS = { @@ -98,7 +98,7 @@ PAIRWISE_DISTANCE_SPARSE_METRICS = { "jaccard": DistanceType.JaccardExpanded, "hellinger": DistanceType.HellingerExpanded, "chebyshev": DistanceType.Linf, - "dice": DistanceType.DiceExpanded + "dice": DistanceType.DiceExpanded, } @@ -345,7 +345,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", handle=None, if metric in ['nan_euclidean']: return nan_euclidean_distances(X, Y, **kwds) - if metric in ['russellrao', 'dice'] and not np.all(X.data == 1.): + if metric in {'russellrao', 'dice'} and not np.all(X.data == 1.): warnings.warn("X was converted to boolean for metric {}" .format(metric)) X = np.where(X != 0., 1.0, 0.0) @@ -368,7 +368,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", handle=None, if (n_samples_x == 1 or n_features_x == 1): input_order = "K" - if metric in ['russellrao', 'dice'] and not np.all(Y.data == 1.): + if metric in {'russellrao', 'dice'} and not np.all(Y.data == 1.): warnings.warn("Y was converted to boolean for metric {}" .format(metric)) Y = np.where(Y != 0., 1.0, 0.0) From 354e2eb21d3f2e45867fdf6567c177e42e177695 Mon Sep 17 00:00:00 2001 From: aamijar Date: Wed, 26 Jun 2024 18:51:47 +0000 Subject: [PATCH 7/8] rerun ci From 76b987f984589ea8ba69781456b4a737a0ddeafd Mon Sep 17 00:00:00 2001 From: aamijar Date: Fri, 28 Jun 2024 05:21:41 +0000 Subject: [PATCH 8/8] remove binarization --- python/cuml/metrics/pairwise_distances.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/metrics/pairwise_distances.pyx b/python/cuml/metrics/pairwise_distances.pyx index 269b1dbdc8..8409ce7dd9 100644 --- a/python/cuml/metrics/pairwise_distances.pyx +++ b/python/cuml/metrics/pairwise_distances.pyx @@ -345,7 +345,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", handle=None, if metric in ['nan_euclidean']: return nan_euclidean_distances(X, Y, **kwds) - if metric in {'russellrao', 'dice'} and not np.all(X.data == 1.): + if metric in {'russellrao'} and not np.all(X.data == 1.): warnings.warn("X was converted to boolean for metric {}" .format(metric)) X = np.where(X != 0., 1.0, 0.0) @@ -368,7 +368,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", handle=None, if (n_samples_x == 1 or n_features_x == 1): input_order = "K" - if metric in {'russellrao', 'dice'} and not np.all(Y.data == 1.): + if metric in {'russellrao'} and not np.all(Y.data == 1.): warnings.warn("Y was converted to boolean for metric {}" .format(metric)) Y = np.where(Y != 0., 1.0, 0.0)