Skip to content

Commit

Permalink
Simplicial set functions (rapidsai#4756)
Browse files Browse the repository at this point in the history
Authors:
  - Victor Lafargue (https://github.com/viclafargue)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Sevag Hanssian (https://github.com/sevagh)

URL: rapidsai#4756
  • Loading branch information
viclafargue authored May 31, 2022
1 parent bcac0ba commit 8d8eedb
Show file tree
Hide file tree
Showing 3 changed files with 200 additions and 25 deletions.
3 changes: 3 additions & 0 deletions ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ unset GIT_DESCRIBE_TAG
# ucx-py version
export UCX_PY_VERSION='0.26.*'

# configure numba threading library: select the "workqueue" threading layer
# for this CI GPU build via numba's NUMBA_THREADING_LAYER environment variable
export NUMBA_THREADING_LAYER=workqueue

################################################################################
# SETUP - Check environment
################################################################################
Expand Down
188 changes: 188 additions & 0 deletions python/cuml/tests/test_simpl_set.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
from cuml.datasets import make_blobs
import numpy as np
import cupy as cp
import umap.distances as dist
from cuml.manifold.umap import UMAP
from cuml.neighbors import NearestNeighbors

from umap.umap_ import fuzzy_simplicial_set as ref_fuzzy_simplicial_set
from cuml.manifold.umap import fuzzy_simplicial_set \
as cu_fuzzy_simplicial_set
from umap.umap_ import simplicial_set_embedding as ref_simplicial_set_embedding
from cuml.manifold.umap import simplicial_set_embedding \
as cu_simplicial_set_embedding


def correctness_dense(a, b, rtol=0.1, threshold=0.95):
    """Tell whether enough elements of ``a`` are relatively close to ``b``.

    An element counts as correct when ``|a - b| <= rtol * |b|``. The
    fraction of correct elements (relative to ``a.size``) is then compared
    against ``threshold``.

    Parameters
    ----------
    a, b : array-like
        Arrays to compare element-wise; they must support subtraction,
        ``cp.abs`` and (for ``a``) the ``size`` attribute.
    rtol : float
        Relative tolerance, scaled by ``|b|``.
    threshold : float
        Minimum fraction of correct elements required.

    Returns
    -------
    The result of ``fraction_correct >= threshold`` (truthy on success).
    """
    within_tolerance = cp.abs(a - b) <= (rtol * cp.abs(b))
    fraction_correct = within_tolerance.sum() / a.size
    return fraction_correct >= threshold


def correctness_sparse(a, b, atol=0.1, rtol=0.2, threshold=0.95):
    """Tell whether enough non-zero reference entries of ``a`` match ``b``.

    An element counts as correct when ``|a - b| <= atol + rtol * |b|``.
    The number of zero entries of ``a`` is subtracted from the count of
    correct elements, and the remainder is divided by the number of
    non-zero entries of ``a`` before being compared against ``threshold``.

    Parameters
    ----------
    a : array-like
        Reference (densified) matrix; its zero entries are discounted.
    b : array-like
        Matrix under test, same shape as ``a``.
    atol, rtol : float
        Absolute tolerance and relative tolerance (scaled by ``|b|``).
    threshold : float
        Minimum corrected fraction of matching elements required.

    Returns
    -------
    The result of ``corrected_fraction >= threshold`` (truthy on success).
    """
    zero_count = (a == 0).sum()
    nonzero_count = a.size - zero_count
    tolerance = atol + rtol * cp.abs(b)
    match_count = (cp.abs(a - b) <= tolerance).sum()
    corrected_fraction = (match_count - zero_count) / nonzero_count
    return corrected_fraction >= threshold


@pytest.mark.parametrize('n_rows', [800, 5000])
@pytest.mark.parametrize('n_features', [8, 32])
@pytest.mark.parametrize('n_neighbors', [8, 16])
@pytest.mark.parametrize('precomputed_nearest_neighbors', [False, True])
def test_fuzzy_simplicial_set(n_rows,
                              n_features,
                              n_neighbors,
                              precomputed_nearest_neighbors):
    """Compare cuML's fuzzy simplicial set with the umap reference.

    Blob data is generated, both implementations build their graph from it
    (optionally from KNN results computed up front with cuML's
    ``NearestNeighbors``), and the densified graphs are compared with
    ``correctness_sparse``.
    """
    n_clusters = 30
    random_state = 42
    metric = 'euclidean'

    X, _ = make_blobs(n_samples=n_rows, centers=n_clusters,
                      n_features=n_features, random_state=random_state)

    # Positional arguments that follow X in both implementations.
    fss_args = (n_neighbors, random_state, metric)

    if not precomputed_nearest_neighbors:
        cu_fss_graph = cu_fuzzy_simplicial_set(X, *fss_args)

        # The reference implementation is handed the result of X.get()
        # (presumably a host-side copy of the data).
        ref_result = ref_fuzzy_simplicial_set(X.get(), *fss_args)
    else:
        knn_model = NearestNeighbors(n_neighbors=n_neighbors,
                                     metric=metric)
        knn_model.fit(X)
        knn_dists, knn_indices = knn_model.kneighbors(X,
                                                      n_neighbors,
                                                      return_distance=True)

        cu_fss_graph = cu_fuzzy_simplicial_set(X,
                                               *fss_args,
                                               knn_indices=knn_indices,
                                               knn_dists=knn_dists)

        # Only the KNN arrays are converted with .get() here; X itself is
        # passed through unchanged, as in the cuML call above.
        ref_result = ref_fuzzy_simplicial_set(X,
                                              *fss_args,
                                              knn_indices=knn_indices.get(),
                                              knn_dists=knn_dists.get())

    ref_fss_graph = ref_result[0].tocoo()

    cu_dense = cu_fss_graph.todense()
    ref_dense = cp.sparse.coo_matrix(ref_fss_graph).todense()
    assert correctness_sparse(ref_dense,
                              cu_dense,
                              atol=0.1,
                              rtol=0.2,
                              threshold=0.95)


@pytest.mark.parametrize('n_rows', [800, 5000])
@pytest.mark.parametrize('n_features', [8, 32])
@pytest.mark.parametrize('n_neighbors', [8, 16])
@pytest.mark.parametrize('n_components', [2, 5])
def test_simplicial_set_embedding(n_rows,
                                  n_features,
                                  n_neighbors,
                                  n_components):
    """Compare cuML's simplicial set embedding with the umap reference.

    Each implementation builds its own fuzzy simplicial set from the same
    blob data and then embeds it with the same hyper-parameters; the two
    embeddings are compared with ``correctness_dense``.
    """
    # Data generation / graph construction settings.
    n_clusters = 30
    random_state = 42
    metric = 'euclidean'
    metric_kwds = {}

    # Optimisation settings shared by both implementations.
    initial_alpha = 1.0
    a, b = UMAP.find_ab_params(1.0, 0.1)
    gamma = 0
    negative_sample_rate = 5
    n_epochs = 500
    init = 'random'

    # Settings only accepted by the reference implementation.
    densmap = False
    densmap_kwds = {}
    output_dens = False

    output_metric = 'euclidean'
    output_metric_kwds = {}

    X, _ = make_blobs(n_samples=n_rows, centers=n_clusters,
                      n_features=n_features, random_state=random_state)
    X = X.get()

    # Positional arguments that follow (X, graph) in both embedding calls.
    embedding_args = (n_components, initial_alpha, a, b, gamma,
                      negative_sample_rate, n_epochs, init)
    fss_args = (n_neighbors, random_state, metric)

    ref_fss_graph = ref_fuzzy_simplicial_set(X, *fss_args)[0]
    ref_result = ref_simplicial_set_embedding(
        X,
        ref_fss_graph,
        *embedding_args,
        np.random.RandomState(random_state),
        dist.named_distances_with_gradients[metric],
        metric_kwds,
        densmap,
        densmap_kwds,
        output_dens,
        output_metric=output_metric,
        output_metric_kwds=output_metric_kwds)
    ref_embedding = ref_result[0]

    cu_fss_graph = cu_fuzzy_simplicial_set(X, *fss_args)
    cu_embedding = cu_simplicial_set_embedding(
        X,
        cu_fss_graph,
        *embedding_args,
        random_state,
        metric,
        metric_kwds,
        output_metric=output_metric,
        output_metric_kwds=output_metric_kwds)

    ref_embedding = cp.array(ref_embedding)
    assert correctness_dense(ref_embedding,
                             cu_embedding,
                             rtol=0.1,
                             threshold=0.95)
34 changes: 9 additions & 25 deletions python/cuml/tests/test_umap.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,44 +541,28 @@ def correctness_sparse(a, b, atol=0.1, rtol=0.2, threshold=0.95):
return correctness >= threshold


@pytest.mark.parametrize('n_rows', [800, 5000])
@pytest.mark.parametrize('n_rows', [200, 800])
@pytest.mark.parametrize('n_features', [8, 32])
@pytest.mark.parametrize('n_neighbors', [8, 16])
@pytest.mark.parametrize('precomputed_nearest_neighbors', [False, True])
def test_fuzzy_simplicial_set(n_rows,
n_features,
n_neighbors,
precomputed_nearest_neighbors):
n_neighbors):
n_clusters = 30
random_state = 42
metric = 'euclidean'

X, _ = make_blobs(n_samples=n_rows, centers=n_clusters,
n_features=n_features, random_state=random_state)

if precomputed_nearest_neighbors:
nn = NearestNeighbors(n_neighbors=n_neighbors,
metric=metric)
nn.fit(X)
knn_dists, knn_indices = nn.kneighbors(X,
n_neighbors,
return_distance=True)
knn_graph = nn.kneighbors_graph(X, mode="distance")
model = cuUMAP(n_neighbors=n_neighbors)
model.fit(X,
knn_graph=knn_graph)
cu_fss_graph = model.graph_

knn_indices = knn_indices
knn_dists = knn_dists
model = cuUMAP(n_neighbors=n_neighbors)
model.fit(X)
cu_fss_graph = model.graph_

else:
model = cuUMAP(n_neighbors=n_neighbors)
model.fit(X)
cu_fss_graph = model.graph_
model = umap.UMAP(n_neighbors=n_neighbors)
model.fit(X)
ref_fss_graph = model.graph_

cu_fss_graph = cu_fss_graph.todense()
ref_fss_graph = cu_fss_graph
ref_fss_graph = cp.sparse.coo_matrix(ref_fss_graph).todense()
assert correctness_sparse(ref_fss_graph,
cu_fss_graph,
atol=0.1,
Expand Down

0 comments on commit 8d8eedb

Please sign in to comment.