Skip to content

Commit

Permalink
KNN from RAFT (#3603)
Browse files Browse the repository at this point in the history
Closes #3457

This PR switches cuML code to use brute-force KNN from RAFT.

Authors:
  - Victor Lafargue (@viclafargue)

Approvers:
  - Divye Gala (@divyegala)
  - Corey J. Nolet (@cjnolet)

URL: #3603
  • Loading branch information
viclafargue authored Mar 30, 2021
1 parent 4f4ae58 commit aeda29b
Show file tree
Hide file tree
Showing 20 changed files with 187 additions and 615 deletions.
11 changes: 4 additions & 7 deletions cpp/src/knn/knn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <ml_mg_utils.cuh>

#include <label/classlabels.cuh>
#include <raft/spatial/knn/knn.hpp>
#include <selection/knn.cuh>

#include <cuda_runtime.h>
Expand All @@ -38,13 +39,9 @@ void brute_force_knn(const raft::handle_t &handle, std::vector<float *> &input,
ASSERT(input.size() == sizes.size(),
"input and sizes vectors must be the same size");

std::vector<cudaStream_t> int_streams = handle.get_internal_streams();

MLCommon::Selection::brute_force_knn(
input, sizes, D, search_items, n, res_I, res_D, k,
handle.get_device_allocator(), handle.get_stream(), int_streams.data(),
handle.get_num_internal_streams(), rowMajorIndex, rowMajorQuery, nullptr,
metric, metric_arg);
raft::spatial::knn::brute_force_knn(
handle, input, sizes, D, search_items, n, res_I, res_D, k, rowMajorIndex,
rowMajorQuery, nullptr, metric, metric_arg);
}

void approx_knn_build_index(raft::handle_t &handle, ML::knnIndex *index,
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/knn/knn_classify_mg.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@ void knn_classify(raft::handle_t &handle, std::vector<Matrix::Data<int> *> *out,
rowMajorQuery, k, batch_size, verbose, n_unique.size(), &y, &n_unique,
&uniq_labels, out, probas);

cuda_utils cutils(handle);
opg_knn(params, cutils);
opg_knn(params, handle);
}
}; // namespace opg
}; // namespace KNN
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/knn/knn_mg.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@ void knn(raft::handle_t &handle, std::vector<Matrix::Data<int64_t> *> *out_I,
knn_operation::knn, &idx_data, &idx_desc, &query_data, &query_desc,
rowMajorIndex, rowMajorQuery, k, batch_size, verbose, out_D, out_I);

cuda_utils cutils(handle);
opg_knn(params, cutils);
opg_knn(params, handle);
}
}; // namespace opg
}; // namespace KNN
Expand Down
250 changes: 124 additions & 126 deletions cpp/src/knn/knn_opg_common.cuh

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions cpp/src/knn/knn_regress_mg.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ void knn_regress(raft::handle_t &handle,
knn_operation::regression, &idx_data, &idx_desc, &query_data, &query_desc,
rowMajorIndex, rowMajorQuery, k, batch_size, verbose, n_outputs, &y, out);

cuda_utils cutils(handle);
opg_knn(params, cutils);
opg_knn(params, handle);
}
}; // namespace opg
}; // namespace KNN
Expand Down
5 changes: 1 addition & 4 deletions cpp/src/metrics/trustworthiness.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,8 @@ template <typename math_t, raft::distance::DistanceType distance_type>
double trustworthiness_score(const raft::handle_t& h, math_t* X,
math_t* X_embedded, int n, int m, int d,
int n_neighbors, int batchSize) {
cudaStream_t stream = h.get_stream();
auto d_alloc = h.get_device_allocator();

return MLCommon::Score::trustworthiness_score<math_t, distance_type>(
X, X_embedded, n, m, d, n_neighbors, d_alloc, stream, batchSize);
h, X, X_embedded, n, m, d, n_neighbors, batchSize);
}

template double
Expand Down
7 changes: 3 additions & 4 deletions cpp/src/tsne/distances.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,9 @@ void get_distances(const raft::handle_t &handle,
cudaStream_t userStream,
*/

MLCommon::Selection::brute_force_knn(input_vec, sizes_vec, input.d, input.X,
input.n, k_graph.knn_indices,
k_graph.knn_dists, k_graph.n_neighbors,
handle.get_device_allocator(), stream);
raft::spatial::knn::brute_force_knn(handle, input_vec, sizes_vec, input.d,
input.X, input.n, k_graph.knn_indices,
k_graph.knn_dists, k_graph.n_neighbors);
}

// dense, int32 indices
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/umap/knn_graph/algo.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@ void launcher(const raft::handle_t &handle,
ptrs[0] = inputsA.X;
sizes[0] = inputsA.n;

MLCommon::Selection::brute_force_knn(
ptrs, sizes, inputsA.d, inputsB.X, inputsB.n, out.knn_indices,
out.knn_dists, n_neighbors, d_alloc, stream);
raft::spatial::knn::brute_force_knn(handle, ptrs, sizes, inputsA.d, inputsB.X,
inputsB.n, out.knn_indices, out.knn_dists,
n_neighbors);
}

// Instantiation for dense inputs, int indices
Expand Down
28 changes: 16 additions & 12 deletions cpp/src_prims/metrics/scores.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -27,8 +27,8 @@
#include <cuml/common/cuml_allocator.hpp>

#include <distance/distance.cuh>
#include <raft/spatial/knn/knn.hpp>
#include <selection/columnWiseSort.cuh>
#include <selection/knn.cuh>

#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
Expand Down Expand Up @@ -81,9 +81,11 @@ __global__ void compute_rank(math_t *ind_X, knn_index_t *ind_X_embedded, int n,
* @return Matrix holding the indices of the nearest neighbors
*/
template <typename math_t>
long *get_knn_indices(math_t *input, int n, int d, int n_neighbors,
std::shared_ptr<deviceAllocator> d_alloc,
cudaStream_t stream) {
long *get_knn_indices(const raft::handle_t &h, math_t *input, int n, int d,
int n_neighbors) {
cudaStream_t stream = h.get_stream();
auto d_alloc = h.get_device_allocator();

long *d_pred_I =
(int64_t *)d_alloc->allocate(n * n_neighbors * sizeof(int64_t), stream);
math_t *d_pred_D =
Expand All @@ -94,8 +96,8 @@ long *get_knn_indices(math_t *input, int n, int d, int n_neighbors,
ptrs[0] = input;
sizes[0] = n;

MLCommon::Selection::brute_force_knn(ptrs, sizes, d, input, n, d_pred_I,
d_pred_D, n_neighbors, d_alloc, stream);
raft::spatial::knn::brute_force_knn(h, ptrs, sizes, d, input, n, d_pred_I,
d_pred_D, n_neighbors);

d_alloc->deallocate(d_pred_D, n * n_neighbors * sizeof(math_t), stream);
return d_pred_I;
Expand All @@ -116,20 +118,22 @@ long *get_knn_indices(math_t *input, int n, int d, int n_neighbors,
* @return Trustworthiness score
*/
template <typename math_t, raft::distance::DistanceType distance_type>
double trustworthiness_score(math_t *X, math_t *X_embedded, int n, int m, int d,
int n_neighbors,
std::shared_ptr<deviceAllocator> d_alloc,
cudaStream_t stream, int batchSize = 512) {
double trustworthiness_score(const raft::handle_t &h, math_t *X,
math_t *X_embedded, int n, int m, int d,
int n_neighbors, int batchSize = 512) {
const int TMP_SIZE = batchSize * n;

cudaStream_t stream = h.get_stream();
auto d_alloc = h.get_device_allocator();

typedef cutlass::Shape<8, 128, 128> OutputTile_t;

math_t *d_pdist_tmp =
(math_t *)d_alloc->allocate(TMP_SIZE * sizeof(math_t), stream);
int *d_ind_X_tmp = (int *)d_alloc->allocate(TMP_SIZE * sizeof(int), stream);

int64_t *ind_X_embedded =
get_knn_indices(X_embedded, n, d, n_neighbors + 1, d_alloc, stream);
get_knn_indices(h, X_embedded, n, d, n_neighbors + 1);

double t_tmp = 0.0;
double t = 0.0;
Expand Down
181 changes: 0 additions & 181 deletions cpp/src_prims/selection/knn.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -373,187 +373,6 @@ void approx_knn_search(raft::handle_t &handle, float *distances,
query_metric_processor->postprocess(distances);
}

/**
* Search the kNN for the k-nearest neighbors of a set of query vectors
* @param[in] input vector of device device memory array pointers to search
* @param[in] sizes vector of memory sizes for each device array pointer in input
* @param[in] D number of cols in input and search_items
* @param[in] search_items set of vectors to query for neighbors
* @param[in] n number of items in search_items
* @param[out] res_I pointer to device memory for returning k nearest indices
* @param[out] res_D pointer to device memory for returning k nearest distances
* @param[in] k number of neighbors to query
* @param[in] allocator the device memory allocator to use for temporary scratch memory
* @param[in] userStream the main cuda stream to use
* @param[in] internalStreams optional when n_params > 0, the index partitions can be
* queried in parallel using these streams. Note that n_int_streams also
* has to be > 0 for these to be used and their cardinality does not need
* to correspond to n_parts.
* @param[in] n_int_streams size of internalStreams. When this is <= 0, only the
* user stream will be used.
* @param[in] rowMajorIndex are the index arrays in row-major layout?
* @param[in] rowMajorQuery are the query array in row-major layout?
* @param[in] translations translation ids for indices when index rows represent
* non-contiguous partitions
* @param[in] metric corresponds to the raft::distance::DistanceType enum (default is L2Expanded)
* @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm
*/
template <typename IntType = int>
void brute_force_knn(std::vector<float *> &input, std::vector<int> &sizes,
IntType D, float *search_items, IntType n, int64_t *res_I,
float *res_D, IntType k,
std::shared_ptr<deviceAllocator> allocator,
cudaStream_t userStream,
cudaStream_t *internalStreams = nullptr,
int n_int_streams = 0, bool rowMajorIndex = true,
bool rowMajorQuery = true,
std::vector<int64_t> *translations = nullptr,
raft::distance::DistanceType metric =
raft::distance::DistanceType::L2Expanded,
float metricArg = 0) {
ASSERT(input.size() == sizes.size(),
"input and sizes vectors should be the same size");

std::vector<int64_t> *id_ranges;
if (translations == nullptr) {
// If we don't have explicit translations
// for offsets of the indices, build them
// from the local partitions
id_ranges = new std::vector<int64_t>();
int64_t total_n = 0;
for (int i = 0; i < input.size(); i++) {
id_ranges->push_back(total_n);
total_n += sizes[i];
}
} else {
// otherwise, use the given translations
id_ranges = translations;
}

// perform preprocessing
std::unique_ptr<MetricProcessor<float>> query_metric_processor =
create_processor<float>(metric, n, D, k, rowMajorQuery, userStream,
allocator);
query_metric_processor->preprocess(search_items);

std::vector<std::unique_ptr<MetricProcessor<float>>> metric_processors(
input.size());
for (int i = 0; i < input.size(); i++) {
metric_processors[i] = create_processor<float>(
metric, sizes[i], D, k, rowMajorQuery, userStream, allocator);
metric_processors[i]->preprocess(input[i]);
}

int device;
CUDA_CHECK(cudaGetDevice(&device));

device_buffer<int64_t> trans(allocator, userStream, id_ranges->size());
raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(),
userStream);

device_buffer<float> all_D(allocator, userStream, 0);
device_buffer<int64_t> all_I(allocator, userStream, 0);

float *out_D = res_D;
int64_t *out_I = res_I;

if (input.size() > 1) {
all_D.resize(input.size() * k * n, userStream);
all_I.resize(input.size() * k * n, userStream);

out_D = all_D.data();
out_I = all_I.data();
}

// Sync user stream only if using other streams to parallelize query
if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream));

for (int i = 0; i < input.size(); i++) {
float *out_d_ptr = out_D + (i * k * n);
int64_t *out_i_ptr = out_I + (i * k * n);

cudaStream_t stream =
raft::select_stream(userStream, internalStreams, n_int_streams, i);

switch (metric) {
case raft::distance::DistanceType::Haversine:

ASSERT(D == 2,
"Haversine distance requires 2 dimensions "
"(latitude / longitude).");

raft::selection::haversine_knn(out_i_ptr, out_d_ptr, input[i],
search_items, sizes[i], n, k, stream);
break;
default:
faiss::MetricType m = build_faiss_metric(metric);

faiss::gpu::StandardGpuResources gpu_res;

gpu_res.noTempMemory();
gpu_res.setDefaultStream(device, stream);

faiss::gpu::GpuDistanceParams args;
args.metric = m;
args.metricArg = metricArg;
args.k = k;
args.dims = D;
args.vectors = input[i];
args.vectorsRowMajor = rowMajorIndex;
args.numVectors = sizes[i];
args.queries = search_items;
args.queriesRowMajor = rowMajorQuery;
args.numQueries = n;
args.outDistances = out_d_ptr;
args.outIndices = out_i_ptr;

/**
* @todo: Until FAISS supports pluggable allocation strategies,
* we will not reap the benefits of the pool allocator for
* avoiding device-wide synchronizations from cudaMalloc/cudaFree
*/
bfKnn(&gpu_res, args);
}
}

// Sync internal streams if used. We don't need to
// sync the user stream because we'll already have
// fully serial execution.
for (int i = 0; i < n_int_streams; i++) {
CUDA_CHECK(cudaStreamSynchronize(internalStreams[i]));
}

if (input.size() > 1 || translations != nullptr) {
// This is necessary for proper index translations. If there are
// no translations or partitions to combine, it can be skipped.
knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream,
trans.data());
}

// Perform necessary post-processing
if (metric == raft::distance::DistanceType::L2SqrtExpanded ||
metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
metric == raft::distance::DistanceType::LpUnexpanded) {
/**
* post-processing
*/
float p = 0.5; // standard l2
if (metric == raft::distance::DistanceType::LpUnexpanded)
p = 1.0 / metricArg;
raft::linalg::unaryOp<float>(
res_D, res_D, n * k,
[p] __device__(float input) { return powf(input, p); }, userStream);
}

query_metric_processor->revert(search_items);
query_metric_processor->postprocess(out_D);
for (int i = 0; i < input.size(); i++) {
metric_processors[i]->revert(input[i]);
}

if (translations == nullptr) delete id_ranges;
};

template <typename OutType = float, bool precomp_lbls = false>
__global__ void class_probs_kernel(OutType *out, const int64_t *knn_indices,
const int *labels, int n_uniq_labels,
Expand Down
2 changes: 0 additions & 2 deletions cpp/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,6 @@ if(BUILD_PRIMS_TESTS)
prims/gather.cu
prims/gram.cu
prims/grid_sync.cu
prims/haversine_knn.cu
prims/hinge.cu
prims/histogram.cu
prims/homogeneity_score.cu
Expand All @@ -177,7 +176,6 @@ if(BUILD_PRIMS_TESTS)
prims/kl_divergence.cu
prims/knn_classify.cu
prims/knn_regression.cu
prims/knn.cu
prims/kselection.cu
prims/label.cu
prims/linearReg.cu
Expand Down
Loading

0 comments on commit aeda29b

Please sign in to comment.