Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Random ball cover algorithm for 3D data #4582

Merged
merged 31 commits into from
May 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
acd03f1
Adding support for rbc in 3d
cjnolet Feb 15, 2022
427dbd0
Setting raft pin
cjnolet Feb 15, 2022
1eec541
Fixing style
cjnolet Feb 15, 2022
ea8f9fe
Removing unnecessary code
cjnolet Feb 15, 2022
daaf8dd
Merge remote-tracking branch 'rapidsai/branch-22.04' into fea-2204-rb…
cjnolet Feb 16, 2022
d860150
Updating copyright
cjnolet Feb 16, 2022
d1ea752
Merge remote-tracking branch 'rapidsai/branch-22.04' into fea-2204-rb…
cjnolet Feb 23, 2022
294d815
Using static linking when internal raft clone is used
cjnolet Feb 24, 2022
0191abe
RAFT qualify environment var
cjnolet Feb 24, 2022
ad91e41
Adding -fPIC to build flags
cjnolet Feb 24, 2022
7b92b25
Updating pin
cjnolet Feb 26, 2022
f8c8406
Fixing destructor of neighbors cython so subclasses can call it.
cjnolet Feb 26, 2022
e9aed72
Updating knn regressor and classifier
cjnolet Feb 27, 2022
afab91c
nearest neighbors python to fall back to brute force when rbc shouldn't
cjnolet Feb 27, 2022
2ccf80e
Merge remote-tracking branch 'rapidsai/branch-22.04' into fea-2204-rb…
cjnolet Feb 27, 2022
fe9bfd4
Updating copyrights for python
cjnolet Feb 27, 2022
abafc35
Reverting changes to test_umap
cjnolet Feb 27, 2022
3cb96c0
Fixing style
cjnolet Feb 27, 2022
b5a900c
Merge branch 'branch-22.04' into fea-2204-rbc_3d
cjnolet Mar 1, 2022
0088e4a
Merge remote-tracking branch 'rapidsai/branch-22.04' into fea-2204-rb…
cjnolet Mar 17, 2022
c12d130
Merge branch 'branch-22.04' into fea-2204-rbc_3d
cjnolet Mar 28, 2022
581234c
Merge remote-tracking branch 'rapidsai/branch-22.06' into fea-2204-rb…
cjnolet Apr 28, 2022
9b5c3ee
Turning off static linking
cjnolet Apr 28, 2022
a7d363e
Not setting static link libraries to on
cjnolet Apr 28, 2022
1564a04
Fixing duplicate rng import
cjnolet Apr 28, 2022
cee6787
Merge remote-tracking branch 'rapidsai/branch-22.06' into fea-2204-rb…
cjnolet May 3, 2022
e5d016e
Allowing a couple mismatched indices just in the case of
cjnolet May 4, 2022
ad4e472
Reverting change
cjnolet May 13, 2022
7329c1c
Merge remote-tracking branch 'rapidsai/branch-22.06' into fea-2204-rb…
cjnolet May 13, 2022
589a9c4
Reverting umap changes
cjnolet May 13, 2022
aced187
Review feedback
cjnolet May 13, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions cpp/src/randomforest/randomforest.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,9 @@

#include <metrics/scores.cuh>
#include <raft/random/permute.hpp>
#include <raft/random/rng.hpp>
#include <raft/random/rng.cuh>

#include <raft/cudart_utils.h>
#include <raft/random/rng.hpp>

#ifdef _OPENMP
#include <omp.h>
Expand Down
33 changes: 24 additions & 9 deletions python/cuml/neighbors/nearest_neighbors.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import cupyx
import cudf
import ctypes
import warnings
import math

import cuml.internals
from cuml.common.base import Base
Expand Down Expand Up @@ -178,7 +179,7 @@ class NearestNeighbors(Base,
- ``'rbc'``: for the random ball algorithm, which partitions
the data space and uses the triangle inequality to lower the
number of potential distances. Currently, this algorithm
supports 2d Euclidean and Haversine.
supports Haversine (2d) and Euclidean in 2d and 3d.
- ``'brute'``: for brute-force, slow but produces exact results
- ``'ivfflat'``: for inverted file, divide the dataset in partitions
and perform search on relevant partitions only
Expand Down Expand Up @@ -347,15 +348,17 @@ class NearestNeighbors(Base,
self.n_dims = X.shape[1]

if self.algorithm == "auto":
if self.n_dims == 2 and self.metric in \
cuml.neighbors.VALID_METRICS["rbc"]:
if (self.n_dims == 2 or self.n_dims == 3) and \
cjnolet marked this conversation as resolved.
Show resolved Hide resolved
not is_sparse(X) and \
self.metric in cuml.neighbors.VALID_METRICS["rbc"] and \
math.sqrt(X.shape[0]) >= self.n_neighbors:
self.working_algorithm_ = "rbc"
else:
self.working_algorithm_ = "brute"

if self.algorithm == "rbc" and self.n_dims > 2:
if self.algorithm == "rbc" and self.n_dims > 3:
raise ValueError("The rbc algorithm is not supported for"
" >2 dimensions currently.")
" >3 dimensions currently.")

if is_sparse(X):
valid_metrics = cuml.neighbors.VALID_METRICS_SPARSE
Expand Down Expand Up @@ -703,7 +706,16 @@ class NearestNeighbors(Base,
cdef BallCoverIndex[int64_t, float, uint32_t]* rbc_index = \
<BallCoverIndex[int64_t, float, uint32_t]*> 0

if self.working_algorithm_ == 'brute':
fallback_to_brute = self.working_algorithm_ == "rbc" and \
n_neighbors > math.sqrt(self.X_m.shape[0])

if fallback_to_brute:
warnings.warn("algorithm='rbc' requires sqrt(%s) be "
"> n_neighbors (%s). falling back to "
"brute force search" %
(self.X_m.shape[0], n_neighbors))

if self.working_algorithm_ == 'brute' or fallback_to_brute:
inputs.push_back(<float*><uintptr_t>self.X_m.ptr)
sizes.push_back(<int>self.X_m.shape[0])

Expand Down Expand Up @@ -886,12 +898,15 @@ class NearestNeighbors(Base,
def __del__(self):
cdef knnIndex* knn_index = <knnIndex*>0
cdef BallCoverIndex* rbc_index = <BallCoverIndex*>0
if self.knn_index is not None:

kidx = self.__dict__['knn_index'] \
if 'knn_index' in self.__dict__ else None
if kidx is not None:
if self.working_algorithm_ in ["ivfflat", "ivfpq", "ivfsq"]:
knn_index = <knnIndex*><uintptr_t>self.knn_index
knn_index = <knnIndex*><uintptr_t>kidx
del knn_index
else:
rbc_index = <BallCoverIndex*><uintptr_t>self.knn_index
rbc_index = <BallCoverIndex*><uintptr_t>kidx
del rbc_index


Expand Down
4 changes: 2 additions & 2 deletions python/cuml/tests/test_kneighbors_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def test_nonmonotonic_labels(n_classes, n_rows, n_cols,
@pytest.mark.parametrize("output_type", ["cudf", "numpy", "cupy"])
def test_predict_multioutput(input_type, output_type):

X = np.array([[0, 0, 1], [1, 0, 1]]).astype(np.float32)
X = np.array([[0, 0, 1, 0], [1, 0, 1, 0]]).astype(np.float32)
y = np.array([[15, 2], [5, 4]]).astype(np.int32)

if input_type == "cudf":
Expand Down Expand Up @@ -300,7 +300,7 @@ def test_predict_multioutput(input_type, output_type):
@pytest.mark.parametrize("output_type", ["cudf", "numpy", "cupy"])
def test_predict_proba_multioutput(input_type, output_type):

X = np.array([[0, 0, 1], [1, 0, 1]]).astype(np.float32)
X = np.array([[0, 0, 1, 0], [1, 0, 1, 0]]).astype(np.float32)
y = np.array([[15, 2], [5, 4]]).astype(np.int32)

if input_type == "cudf":
Expand Down
2 changes: 1 addition & 1 deletion python/cuml/tests/test_kneighbors_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def test_score_dtype(dtype):
@pytest.mark.parametrize("output_type", ["cudf", "numpy", "cupy"])
def test_predict_multioutput(input_type, output_type):

X = np.array([[0, 0, 1], [1, 0, 1]]).astype(np.float32)
X = np.array([[0, 0, 1, 0], [1, 0, 1, 0]]).astype(np.float32)
y = np.array([[15, 2], [5, 4]]).astype(np.int32)

if input_type == "cudf":
Expand Down
18 changes: 13 additions & 5 deletions python/cuml/tests/test_nearest_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,21 +516,25 @@ def test_knn_graph(input_type, mode, output_type, as_instance,
assert isspmatrix_csr(sparse_cu)


@pytest.mark.parametrize('distance', ["euclidean", "haversine"])
@pytest.mark.parametrize('distance_dims', [("euclidean", 2),
("euclidean", 3),
("haversine", 2)])
@pytest.mark.parametrize('n_neighbors', [4, 25])
@pytest.mark.parametrize('nrows', [unit_param(10000), stress_param(70000)])
def test_nearest_neighbors_rbc(distance, n_neighbors, nrows):
def test_nearest_neighbors_rbc(distance_dims, n_neighbors, nrows):
distance, dims = distance_dims

X, y = make_blobs(n_samples=nrows,
centers=25,
shuffle=True,
n_features=2,
n_features=dims,
cluster_std=3.0,
random_state=42)

knn_cu = cuKNN(metric=distance, algorithm="rbc")
knn_cu.fit(X)

query_rows = int(nrows/2)
query_rows = int(nrows / 2)

rbc_d, rbc_i = knn_cu.kneighbors(X[:query_rows, :],
n_neighbors=n_neighbors)
Expand All @@ -548,7 +552,11 @@ def test_nearest_neighbors_rbc(distance, n_neighbors, nrows):
X[:query_rows, :], n_neighbors=n_neighbors)

assert len(brute_d[brute_d != rbc_d]) == 0
assert len(brute_i[brute_i != rbc_i]) == 0

# All the distances match, so allow a few mismatched indices
# through; they can result from potential non-determinism among
# exactly matching distances
assert len(brute_i[brute_i != rbc_i]) <= 3


@pytest.mark.parametrize("metric", valid_metrics_sparse())
Expand Down