From 6bf61ca83d364094eb2a075b5ae9e9d5cdc9d42b Mon Sep 17 00:00:00 2001 From: Victor Lafargue Date: Thu, 3 Aug 2023 02:34:29 +0200 Subject: [PATCH] Fix UMAP and simplicial set functions metric (#5490) Answers #5422 Authors: - Victor Lafargue (https://github.com/viclafargue) - Dante Gama Dessavre (https://github.com/dantegd) - Simon Adorf (https://github.com/csadorf) Approvers: - Simon Adorf (https://github.com/csadorf) - William Hicks (https://github.com/wphicks) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuml/pull/5490 --- cpp/src/umap/knn_graph/algo.cuh | 3 + python/cuml/manifold/simpl_set.pyx | 49 +++++++++++++--- python/cuml/manifold/umap.pyx | 53 +++++++---------- python/cuml/manifold/umap_utils.pyx | 51 ++++++++++++++++ python/cuml/tests/test_umap.py | 91 ++++++++++++++++++----------- 5 files changed, 170 insertions(+), 77 deletions(-) diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh index bacf1a6a79..68d9dda1a3 100644 --- a/cpp/src/umap/knn_graph/algo.cuh +++ b/cpp/src/umap/knn_graph/algo.cuh @@ -71,6 +71,9 @@ inline void launcher(const raft::handle_t& handle, out.knn_indices, out.knn_dists, n_neighbors, + true, + true, + static_cast*>(nullptr), params->metric, params->p); } diff --git a/python/cuml/manifold/simpl_set.pyx b/python/cuml/manifold/simpl_set.pyx index a22f4da38a..d0f30e3e88 100644 --- a/python/cuml/manifold/simpl_set.pyx +++ b/python/cuml/manifold/simpl_set.pyx @@ -22,7 +22,8 @@ from cuml.internals.safe_imports import gpu_only_import cp = gpu_only_import('cupy') from cuml.manifold.umap_utils cimport * -from cuml.manifold.umap_utils import GraphHolder, find_ab_params +from cuml.manifold.umap_utils import GraphHolder, find_ab_params, \ + metric_parsing from cuml.internals.input_utils import input_to_cuml_array from cuml.internals.array import CumlArray @@ -82,10 +83,17 @@ def fuzzy_simplicial_set(X, structure to the detriment of the larger picture. 
random_state: numpy RandomState or equivalent A state capable being used as a numpy random state. - metric: string or function (optional, default 'euclidean') - unused - metric_kwds: dict (optional, default {}) - unused + metric: string (default='euclidean'). + Distance metric to use. Supported distances are ['l1', 'cityblock', + 'taxicab', 'manhattan', 'euclidean', 'l2', 'sqeuclidean', 'canberra', + 'minkowski', 'chebyshev', 'linf', 'cosine', 'correlation', 'hellinger', + 'hamming', 'jaccard'] + Metrics that take arguments (such as minkowski) can have arguments + passed via the metric_kwds dictionary. + Note: The 'jaccard' distance metric is only supported for sparse + inputs. + metric_kwds: dict (optional, default=None) + Metric arguments; only the 'p' entry is read (2.0 when absent) knn_indices: array of shape (n_samples, n_neighbors) (optional) If the k-nearest neighbors of each point has already been calculated you can pass them in here to save computation time. This should be @@ -138,6 +146,14 @@ def fuzzy_simplicial_set(X, umap_params.deterministic = deterministic umap_params.set_op_mix_ratio = set_op_mix_ratio umap_params.local_connectivity = local_connectivity + try: + umap_params.metric = metric_parsing[metric.lower()] + except KeyError: + raise ValueError(f"Invalid value for metric: {metric}") + if metric_kwds is None: + umap_params.p = 2.0 + else: + umap_params.p = metric_kwds.get("p", 2.0) umap_params.verbosity = verbose X_m, _, _, _ = \ @@ -245,10 +261,17 @@ def simplicial_set_embedding( * A numpy array of initial embedding positions. random_state: numpy RandomState or equivalent A state capable being used as a numpy random state. - metric: string or callable - unused - metric_kwds: dict - unused + metric: string (default='euclidean'). + Distance metric to use. 
Supported distances are ['l1', 'cityblock', + 'taxicab', 'manhattan', 'euclidean', 'l2', 'sqeuclidean', 'canberra', + 'minkowski', 'chebyshev', 'linf', 'cosine', 'correlation', 'hellinger', + 'hamming', 'jaccard'] + Metrics that take arguments (such as minkowski) can have arguments + passed via the metric_kwds dictionary. + Note: The 'jaccard' distance metric is only supported for sparse + inputs. + metric_kwds: dict (optional, default=None) + Metric arguments; only the 'p' entry is read (2.0 when absent) output_metric: function Function returning the distance between two points in embedding space and the gradient of the distance wrt the first argument. @@ -306,6 +329,14 @@ def simplicial_set_embedding( umap_params.init = 0 umap_params.random_state = random_state umap_params.deterministic = deterministic + try: + umap_params.metric = metric_parsing[metric.lower()] + except KeyError: + raise ValueError(f"Invalid value for metric: {metric}") + if metric_kwds is None: + umap_params.p = 2.0 + else: + umap_params.p = metric_kwds.get("p", 2.0) if output_metric == 'euclidean': umap_params.target_metric = MetricType.EUCLIDEAN else: # output_metric == 'categorical' diff --git a/python/cuml/manifold/umap.pyx b/python/cuml/manifold/umap.pyx index eb02258e2c..33082f4d4c 100644 --- a/python/cuml/manifold/umap.pyx +++ b/python/cuml/manifold/umap.pyx @@ -29,7 +29,8 @@ cupyx = gpu_only_import('cupyx') cuda = gpu_only_import('numba.cuda') from cuml.manifold.umap_utils cimport * -from cuml.manifold.umap_utils import GraphHolder, find_ab_params +from cuml.manifold.umap_utils import GraphHolder, find_ab_params, \ + metric_parsing, DENSE_SUPPORTED_METRICS, SPARSE_SUPPORTED_METRICS from cuml.common.sparsefuncs import extract_knn_infos from cuml.internals.safe_imports import gpu_only_import_from @@ -47,7 +48,6 @@ from cuml.internals.array import CumlArray from cuml.internals.array_sparse import SparseCumlArray from cuml.internals.mixins import CMajorInputTagMixin from cuml.common.sparse_utils import is_sparse -from 
cuml.metrics.distance_type cimport DistanceType from cuml.manifold.simpl_set import fuzzy_simplicial_set # no-cython-lint from cuml.manifold.simpl_set import simplicial_set_embedding # no-cython-lint @@ -152,13 +152,17 @@ class UMAP(UniversalBase, n_components: int (optional, default 2) The dimension of the space to embed into. This defaults to 2 to provide easy visualization, but can reasonably be set to any - metric : string (default='euclidean'). + metric: string (default='euclidean'). Distance metric to use. Supported distances are ['l1, 'cityblock', 'taxicab', 'manhattan', 'euclidean', 'l2', 'sqeuclidean', 'canberra', 'minkowski', 'chebyshev', 'linf', 'cosine', 'correlation', 'hellinger', 'hamming', 'jaccard'] Metrics that take arguments (such as minkowski) can have arguments passed via the metric_kwds dictionary. + Note: The 'jaccard' distance metric is only supported for sparse + inputs. + metric_kwds: dict (optional, default=None) + Metric argument n_epochs: int (optional, default None) The number of training epochs to be used in optimizing the low dimensional embedding. 
Larger values result in more accurate @@ -419,7 +423,7 @@ class UMAP(UniversalBase, raise ValueError("min_dist should be <= spread") @staticmethod - def _build_umap_params(cls): + def _build_umap_params(cls, sparse): cdef UMAPParams* umap_params = new UMAPParams() umap_params.n_neighbors = cls.n_neighbors umap_params.n_components = cls.n_components @@ -448,37 +452,20 @@ class UMAP(UniversalBase, umap_params.random_state = cls.random_state umap_params.deterministic = cls.deterministic - # metric - metric_parsing = { - "l2": DistanceType.L2SqrtUnexpanded, - "euclidean": DistanceType.L2SqrtUnexpanded, - "sqeuclidean": DistanceType.L2Unexpanded, - "cityblock": DistanceType.L1, - "l1": DistanceType.L1, - "manhattan": DistanceType.L1, - "taxicab": DistanceType.L1, - "minkowski": DistanceType.LpUnexpanded, - "chebyshev": DistanceType.Linf, - "linf": DistanceType.Linf, - "cosine": DistanceType.CosineExpanded, - "correlation": DistanceType.CorrelationExpanded, - "hellinger": DistanceType.HellingerExpanded, - "hamming": DistanceType.HammingUnexpanded, - "jaccard": DistanceType.JaccardExpanded, - "canberra": DistanceType.Canberra - } - - if cls.metric.lower() in metric_parsing: + try: umap_params.metric = metric_parsing[cls.metric.lower()] - else: - raise ValueError("Invalid value for metric: {}" - .format(cls.metric)) - + if sparse: + if umap_params.metric not in SPARSE_SUPPORTED_METRICS: + raise NotImplementedError(f"Metric '{cls.metric}' not supported for sparse inputs.") + elif umap_params.metric not in DENSE_SUPPORTED_METRICS: + raise NotImplementedError(f"Metric '{cls.metric}' not supported for dense inputs.") + + except KeyError: + raise ValueError(f"Invalid value for metric: {cls.metric}") if cls.metric_kwds is None: umap_params.p = 2.0 else: - umap_params.p = cls.metric_kwds.get('p') - + umap_params.p = cls.metric_kwds.get("p", 2.0) cdef uintptr_t callback_ptr = 0 if cls.callback: callback_ptr = cls.callback.get_native_callback() @@ -576,7 +563,7 @@ class 
UMAP(UniversalBase, cdef uintptr_t embed_raw = self.embedding_.ptr cdef UMAPParams* umap_params = \ - UMAP._build_umap_params(self) + UMAP._build_umap_params(self, self.sparse_fit) cdef uintptr_t y_raw = 0 @@ -742,7 +729,7 @@ class UMAP(UniversalBase, cdef uintptr_t embed_ptr = self.embedding_.ptr cdef UMAPParams* umap_params = \ - UMAP._build_umap_params(self) + UMAP._build_umap_params(self, self.sparse_fit) if self.sparse_fit: transform_sparse(handle_[0], diff --git a/python/cuml/manifold/umap_utils.pyx b/python/cuml/manifold/umap_utils.pyx index 5d69ead34c..df89c0fe1f 100644 --- a/python/cuml/manifold/umap_utils.pyx +++ b/python/cuml/manifold/umap_utils.pyx @@ -19,6 +19,7 @@ from rmm._lib.memory_resource cimport get_current_device_resource from pylibraft.common.handle cimport handle_t from cuml.manifold.umap_utils cimport * +from cuml.metrics.distance_type cimport DistanceType from libcpp.utility cimport move from cuml.internals.safe_imports import cpu_only_import np = cpu_only_import('numpy') @@ -130,3 +131,53 @@ def find_ab_params(spread, min_dist): yv[xv >= min_dist] = np.exp(-(xv[xv >= min_dist] - min_dist) / spread) params, _ = curve_fit(curve, xv, yv) return params[0], params[1] + + +metric_parsing = { + "l2": DistanceType.L2SqrtUnexpanded, + "euclidean": DistanceType.L2SqrtUnexpanded, + "sqeuclidean": DistanceType.L2Unexpanded, + "cityblock": DistanceType.L1, + "l1": DistanceType.L1, + "manhattan": DistanceType.L1, + "taxicab": DistanceType.L1, + "minkowski": DistanceType.LpUnexpanded, + "chebyshev": DistanceType.Linf, + "linf": DistanceType.Linf, + "cosine": DistanceType.CosineExpanded, + "correlation": DistanceType.CorrelationExpanded, + "hellinger": DistanceType.HellingerExpanded, + "hamming": DistanceType.HammingUnexpanded, + "jaccard": DistanceType.JaccardExpanded, + "canberra": DistanceType.Canberra +} + + +DENSE_SUPPORTED_METRICS = [ + DistanceType.Canberra, + DistanceType.CorrelationExpanded, + DistanceType.CosineExpanded, + 
DistanceType.HammingUnexpanded, + DistanceType.HellingerExpanded, + # DistanceType.JaccardExpanded, # not supported + DistanceType.L1, + DistanceType.L2SqrtUnexpanded, + DistanceType.L2Unexpanded, + DistanceType.Linf, + DistanceType.LpUnexpanded, +] + + +SPARSE_SUPPORTED_METRICS = [ + DistanceType.Canberra, + DistanceType.CorrelationExpanded, + DistanceType.CosineExpanded, + DistanceType.HammingUnexpanded, + DistanceType.HellingerExpanded, + DistanceType.JaccardExpanded, + DistanceType.L1, + DistanceType.L2SqrtUnexpanded, + DistanceType.L2Unexpanded, + DistanceType.Linf, + DistanceType.LpUnexpanded, +] diff --git a/python/cuml/tests/test_umap.py b/python/cuml/tests/test_umap.py index 479853dc07..34d899e7bf 100644 --- a/python/cuml/tests/test_umap.py +++ b/python/cuml/tests/test_umap.py @@ -674,26 +674,26 @@ def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors): @pytest.mark.parametrize( - "metric", + "metric,supported", [ - "l2", - "euclidean", - "sqeuclidean", - "l1", - "manhattan", - "minkowski", - "chebyshev", - "cosine", - "correlation", - "jaccard", - "hamming", - "canberra", + ("l2", True), + ("euclidean", True), + ("sqeuclidean", True), + ("l1", True), + ("manhattan", True), + ("minkowski", True), + ("chebyshev", True), + ("cosine", True), + ("correlation", True), + ("jaccard", False), + ("hamming", True), + ("canberra", True), ], ) @pytest.mark.skipif( IS_ARM, reason="https://github.com/rapidsai/cuml/issues/5441" ) -def test_umap_distance_metrics_fit_transform_trust(metric): +def test_umap_distance_metrics_fit_transform_trust(metric, supported): data, labels = make_blobs( n_samples=1000, n_features=64, centers=5, random_state=42 ) @@ -707,7 +707,13 @@ def test_umap_distance_metrics_fit_transform_trust(metric): cuml_model = cuUMAP( n_neighbors=10, min_dist=0.01, metric=metric, init="random" ) + if not supported: + with pytest.raises(NotImplementedError): + cuml_model.fit_transform(data) + return + umap_embedding = umap_model.fit_transform(data) + 
cuml_embedding = cuml_model.fit_transform(data) umap_trust = trustworthiness( @@ -721,24 +727,28 @@ def test_umap_distance_metrics_fit_transform_trust(metric): @pytest.mark.parametrize( - "metric", + "metric,supported,umap_learn_supported", [ - "euclidean", - "l1", - "manhattan", - "minkowski", - "chebyshev", - "cosine", - "correlation", - "jaccard", - "hamming", - "canberra", + ("l2", True, False), + ("euclidean", True, True), + ("sqeuclidean", True, False), + ("l1", True, True), + ("manhattan", True, True), + ("minkowski", True, True), + ("chebyshev", True, True), + ("cosine", True, True), + ("correlation", True, True), + ("jaccard", True, True), + ("hamming", True, True), + ("canberra", True, True), ], ) @pytest.mark.skipif( IS_ARM, reason="https://github.com/rapidsai/cuml/issues/5441" ) -def test_umap_distance_metrics_fit_transform_trust_on_sparse_input(metric): +def test_umap_distance_metrics_fit_transform_trust_on_sparse_input( + metric, supported, umap_learn_supported +): data, labels = make_blobs( n_samples=1000, n_features=64, centers=5, random_state=42 ) @@ -752,20 +762,31 @@ def test_umap_distance_metrics_fit_transform_trust_on_sparse_input(metric): new_data = scipy_sparse.csr_matrix(data[~data_selection]) - umap_model = umap.UMAP( - n_neighbors=10, min_dist=0.01, metric=metric, init="random" - ) + if umap_learn_supported: + umap_model = umap.UMAP( + n_neighbors=10, min_dist=0.01, metric=metric, init="random" + ) + umap_embedding = umap_model.fit_transform(new_data) + umap_trust = trustworthiness( + data[~data_selection], + umap_embedding, + n_neighbors=10, + metric=metric, + ) + cuml_model = cuUMAP( n_neighbors=10, min_dist=0.01, metric=metric, init="random" ) - umap_embedding = umap_model.fit_transform(new_data) - cuml_embedding = cuml_model.fit_transform(new_data) - umap_trust = trustworthiness( - data[~data_selection], umap_embedding, n_neighbors=10, metric=metric - ) + if not supported: + with pytest.raises(NotImplementedError): + 
cuml_model.fit_transform(new_data) + return + + cuml_embedding = cuml_model.fit_transform(new_data) cuml_trust = trustworthiness( data[~data_selection], cuml_embedding, n_neighbors=10, metric=metric ) - assert array_equal(umap_trust, cuml_trust, 0.05, with_sign=True) + if umap_learn_supported: + assert array_equal(umap_trust, cuml_trust, 0.05, with_sign=True)