diff --git a/python/cuml/metrics/cluster/silhouette_score.pyx b/python/cuml/metrics/cluster/silhouette_score.pyx index 727da2e932..3cb9ba5fa7 100644 --- a/python/cuml/metrics/cluster/silhouette_score.pyx +++ b/python/cuml/metrics/cluster/silhouette_score.pyx @@ -24,6 +24,7 @@ from cuml.metrics.pairwise_distances import _determine_metric from cuml.raft.common.handle cimport handle_t from cuml.raft.common.handle import Handle from cuml.metrics.distance_type cimport DistanceType +from cuml.prims.label.classlabels import make_monotonic, check_labels cdef extern from "cuml/metrics/metrics.hpp" namespace "ML::Metrics::Batched": float silhouette_score( @@ -105,6 +106,16 @@ def _silhouette_coeff( labels.to_output(output_type='cupy', output_dtype='int') ).shape[0] + if not check_labels(labels, cp.arange(n_labels, dtype=np.int32)): + mono_labels, _ = make_monotonic(labels, copy=True) + mono_labels, _, _, _ = input_to_cuml_array( + mono_labels, + order='C', + convert_to_dtype=np.int32 + ) + else: + mono_labels = labels + cdef uintptr_t scores_ptr if sil_scores is None: scores_ptr = NULL @@ -122,7 +133,7 @@ def _silhouette_coeff( data.ptr, n_rows, n_cols, - labels.ptr, + mono_labels.ptr, n_labels, scores_ptr, chunksize, @@ -132,7 +143,7 @@ def _silhouette_coeff( data.ptr, n_rows, n_cols, - labels.ptr, + mono_labels.ptr, n_labels, scores_ptr, chunksize, diff --git a/python/cuml/test/test_metrics.py b/python/cuml/test/test_metrics.py index adc17caa6e..8e3d3a16a2 100644 --- a/python/cuml/test/test_metrics.py +++ b/python/cuml/test/test_metrics.py @@ -270,6 +270,23 @@ def test_silhouette_samples_batched(metric, chunk_divider, labeled_clusters): assert False +def test_silhouette_score_batched_non_monotonic(): + vecs = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], + [2.0, 2.0, 2.0], [10.0, 10.0, 10.0]]) + labels = np.array([0, 0, 1, 3]) + + cuml_score = cu_silhouette_score(X=vecs, labels=labels) + sk_score = sk_silhouette_score(X=vecs, labels=labels) + assert_almost_equal(cuml_score, sk_score, decimal=2) + + vecs = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [10.0, 10.0, 10.0]]) + labels = np.array([1, 1, 3]) + + cuml_score = cu_silhouette_score(X=vecs, labels=labels) + sk_score = sk_silhouette_score(X=vecs, labels=labels) + assert_almost_equal(cuml_score, sk_score, decimal=2) + + def score_homogeneity(ground_truth, predictions, use_handle): return score_labeling_with_handle(cuml.metrics.homogeneity_score, ground_truth,