KNN from RAFT (#3603)

Closes #3457 This PR switches cuML code to use brute-force KNN from RAFT. Authors: - Victor Lafargue (@viclafargue) Approvers: - Divye Gala (@divyegala) - Corey J. Nolet (@cjnolet) URL: #3603
rapidsai · Mar 30, 2021 · aeda29b · aeda29b
1 parent 4f4ae58
commit aeda29b
Show file tree

Hide file tree

Showing 20 changed files with 187 additions and 615 deletions.
diff --git a/cpp/src/knn/knn.cu b/cpp/src/knn/knn.cu
@@ -20,6 +20,7 @@
 #include <ml_mg_utils.cuh>
 
 #include <label/classlabels.cuh>
+#include <raft/spatial/knn/knn.hpp>
 #include <selection/knn.cuh>
 
 #include <cuda_runtime.h>
@@ -38,13 +39,9 @@ void brute_force_knn(const raft::handle_t &handle, std::vector<float *> &input,
   ASSERT(input.size() == sizes.size(),
          "input and sizes vectors must be the same size");
 
-  std::vector<cudaStream_t> int_streams = handle.get_internal_streams();
-
-  MLCommon::Selection::brute_force_knn(
-    input, sizes, D, search_items, n, res_I, res_D, k,
-    handle.get_device_allocator(), handle.get_stream(), int_streams.data(),
-    handle.get_num_internal_streams(), rowMajorIndex, rowMajorQuery, nullptr,
-    metric, metric_arg);
+  raft::spatial::knn::brute_force_knn(
+    handle, input, sizes, D, search_items, n, res_I, res_D, k, rowMajorIndex,
+    rowMajorQuery, nullptr, metric, metric_arg);
 }
 
 void approx_knn_build_index(raft::handle_t &handle, ML::knnIndex *index,

diff --git a/cpp/src/knn/knn_classify_mg.cu b/cpp/src/knn/knn_classify_mg.cu
@@ -41,8 +41,7 @@ void knn_classify(raft::handle_t &handle, std::vector<Matrix::Data<int> *> *out,
     rowMajorQuery, k, batch_size, verbose, n_unique.size(), &y, &n_unique,
     &uniq_labels, out, probas);
 
-  cuda_utils cutils(handle);
-  opg_knn(params, cutils);
+  opg_knn(params, handle);
 }
 };  // namespace opg
 };  // namespace KNN

diff --git a/cpp/src/knn/knn_mg.cu b/cpp/src/knn/knn_mg.cu
@@ -35,8 +35,7 @@ void knn(raft::handle_t &handle, std::vector<Matrix::Data<int64_t> *> *out_I,
     knn_operation::knn, &idx_data, &idx_desc, &query_data, &query_desc,
     rowMajorIndex, rowMajorQuery, k, batch_size, verbose, out_D, out_I);
 
-  cuda_utils cutils(handle);
-  opg_knn(params, cutils);
+  opg_knn(params, handle);
 }
 };  // namespace opg
 };  // namespace KNN

diff --git a/cpp/src/knn/knn_opg_common.cuh b/cpp/src/knn/knn_opg_common.cuh
diff --git a/cpp/src/knn/knn_regress_mg.cu b/cpp/src/knn/knn_regress_mg.cu
@@ -37,8 +37,7 @@ void knn_regress(raft::handle_t &handle,
     knn_operation::regression, &idx_data, &idx_desc, &query_data, &query_desc,
     rowMajorIndex, rowMajorQuery, k, batch_size, verbose, n_outputs, &y, out);
 
-  cuda_utils cutils(handle);
-  opg_knn(params, cutils);
+  opg_knn(params, handle);
 }
 };  // namespace opg
 };  // namespace KNN

diff --git a/cpp/src/metrics/trustworthiness.cu b/cpp/src/metrics/trustworthiness.cu
@@ -37,11 +37,8 @@ template <typename math_t, raft::distance::DistanceType distance_type>
 double trustworthiness_score(const raft::handle_t& h, math_t* X,
                              math_t* X_embedded, int n, int m, int d,
                              int n_neighbors, int batchSize) {
-  cudaStream_t stream = h.get_stream();
-  auto d_alloc = h.get_device_allocator();
-
   return MLCommon::Score::trustworthiness_score<math_t, distance_type>(
-    X, X_embedded, n, m, d, n_neighbors, d_alloc, stream, batchSize);
+    h, X, X_embedded, n, m, d, n_neighbors, batchSize);
 }
 
 template double

diff --git a/cpp/src/tsne/distances.cuh b/cpp/src/tsne/distances.cuh
@@ -64,10 +64,9 @@ void get_distances(const raft::handle_t &handle,
                      cudaStream_t userStream,
  */
 
-  MLCommon::Selection::brute_force_knn(input_vec, sizes_vec, input.d, input.X,
-                                       input.n, k_graph.knn_indices,
-                                       k_graph.knn_dists, k_graph.n_neighbors,
-                                       handle.get_device_allocator(), stream);
+  raft::spatial::knn::brute_force_knn(handle, input_vec, sizes_vec, input.d,
+                                      input.X, input.n, k_graph.knn_indices,
+                                      k_graph.knn_dists, k_graph.n_neighbors);
 }
 
 // dense, int32 indices

diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh
@@ -61,9 +61,9 @@ void launcher(const raft::handle_t &handle,
   ptrs[0] = inputsA.X;
   sizes[0] = inputsA.n;
 
-  MLCommon::Selection::brute_force_knn(
-    ptrs, sizes, inputsA.d, inputsB.X, inputsB.n, out.knn_indices,
-    out.knn_dists, n_neighbors, d_alloc, stream);
+  raft::spatial::knn::brute_force_knn(handle, ptrs, sizes, inputsA.d, inputsB.X,
+                                      inputsB.n, out.knn_indices, out.knn_dists,
+                                      n_neighbors);
 }
 
 // Instantiation for dense inputs, int indices

diff --git a/cpp/src_prims/metrics/scores.cuh b/cpp/src_prims/metrics/scores.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,8 +27,8 @@
 #include <cuml/common/cuml_allocator.hpp>
 
 #include <distance/distance.cuh>
+#include <raft/spatial/knn/knn.hpp>
 #include <selection/columnWiseSort.cuh>
-#include <selection/knn.cuh>
 
 #include <thrust/device_ptr.h>
 #include <thrust/reduce.h>
@@ -81,9 +81,11 @@ __global__ void compute_rank(math_t *ind_X, knn_index_t *ind_X_embedded, int n,
  * @return Matrix holding the indices of the nearest neighbors
  */
 template <typename math_t>
-long *get_knn_indices(math_t *input, int n, int d, int n_neighbors,
-                      std::shared_ptr<deviceAllocator> d_alloc,
-                      cudaStream_t stream) {
+long *get_knn_indices(const raft::handle_t &h, math_t *input, int n, int d,
+                      int n_neighbors) {
+  cudaStream_t stream = h.get_stream();
+  auto d_alloc = h.get_device_allocator();
+
   long *d_pred_I =
     (int64_t *)d_alloc->allocate(n * n_neighbors * sizeof(int64_t), stream);
   math_t *d_pred_D =
@@ -94,8 +96,8 @@ long *get_knn_indices(math_t *input, int n, int d, int n_neighbors,
   ptrs[0] = input;
   sizes[0] = n;
 
-  MLCommon::Selection::brute_force_knn(ptrs, sizes, d, input, n, d_pred_I,
-                                       d_pred_D, n_neighbors, d_alloc, stream);
+  raft::spatial::knn::brute_force_knn(h, ptrs, sizes, d, input, n, d_pred_I,
+                                      d_pred_D, n_neighbors);
 
   d_alloc->deallocate(d_pred_D, n * n_neighbors * sizeof(math_t), stream);
   return d_pred_I;
@@ -116,20 +118,22 @@ long *get_knn_indices(math_t *input, int n, int d, int n_neighbors,
  * @return Trustworthiness score
  */
 template <typename math_t, raft::distance::DistanceType distance_type>
-double trustworthiness_score(math_t *X, math_t *X_embedded, int n, int m, int d,
-                             int n_neighbors,
-                             std::shared_ptr<deviceAllocator> d_alloc,
-                             cudaStream_t stream, int batchSize = 512) {
+double trustworthiness_score(const raft::handle_t &h, math_t *X,
+                             math_t *X_embedded, int n, int m, int d,
+                             int n_neighbors, int batchSize = 512) {
   const int TMP_SIZE = batchSize * n;
 
+  cudaStream_t stream = h.get_stream();
+  auto d_alloc = h.get_device_allocator();
+
   typedef cutlass::Shape<8, 128, 128> OutputTile_t;
 
   math_t *d_pdist_tmp =
     (math_t *)d_alloc->allocate(TMP_SIZE * sizeof(math_t), stream);
   int *d_ind_X_tmp = (int *)d_alloc->allocate(TMP_SIZE * sizeof(int), stream);
 
   int64_t *ind_X_embedded =
-    get_knn_indices(X_embedded, n, d, n_neighbors + 1, d_alloc, stream);
+    get_knn_indices(h, X_embedded, n, d, n_neighbors + 1);
 
   double t_tmp = 0.0;
   double t = 0.0;

diff --git a/cpp/src_prims/selection/knn.cuh b/cpp/src_prims/selection/knn.cuh
@@ -373,187 +373,6 @@ void approx_knn_search(raft::handle_t &handle, float *distances,
   query_metric_processor->postprocess(distances);
 }
 
-/**
- * Search the kNN for the k-nearest neighbors of a set of query vectors
- * @param[in] input vector of device device memory array pointers to search
- * @param[in] sizes vector of memory sizes for each device array pointer in input
- * @param[in] D number of cols in input and search_items
- * @param[in] search_items set of vectors to query for neighbors
- * @param[in] n        number of items in search_items
- * @param[out] res_I    pointer to device memory for returning k nearest indices
- * @param[out] res_D    pointer to device memory for returning k nearest distances
- * @param[in] k        number of neighbors to query
- * @param[in] allocator the device memory allocator to use for temporary scratch memory
- * @param[in] userStream the main cuda stream to use
- * @param[in] internalStreams optional when n_params > 0, the index partitions can be
- *        queried in parallel using these streams. Note that n_int_streams also
- *        has to be > 0 for these to be used and their cardinality does not need
- *        to correspond to n_parts.
- * @param[in] n_int_streams size of internalStreams. When this is <= 0, only the
- *        user stream will be used.
- * @param[in] rowMajorIndex are the index arrays in row-major layout?
- * @param[in] rowMajorQuery are the query array in row-major layout?
- * @param[in] translations translation ids for indices when index rows represent
- *        non-contiguous partitions
- * @param[in] metric corresponds to the raft::distance::DistanceType enum (default is L2Expanded)
- * @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm
- */
-template <typename IntType = int>
-void brute_force_knn(std::vector<float *> &input, std::vector<int> &sizes,
-                     IntType D, float *search_items, IntType n, int64_t *res_I,
-                     float *res_D, IntType k,
-                     std::shared_ptr<deviceAllocator> allocator,
-                     cudaStream_t userStream,
-                     cudaStream_t *internalStreams = nullptr,
-                     int n_int_streams = 0, bool rowMajorIndex = true,
-                     bool rowMajorQuery = true,
-                     std::vector<int64_t> *translations = nullptr,
-                     raft::distance::DistanceType metric =
-                       raft::distance::DistanceType::L2Expanded,
-                     float metricArg = 0) {
-  ASSERT(input.size() == sizes.size(),
-         "input and sizes vectors should be the same size");
-
-  std::vector<int64_t> *id_ranges;
-  if (translations == nullptr) {
-    // If we don't have explicit translations
-    // for offsets of the indices, build them
-    // from the local partitions
-    id_ranges = new std::vector<int64_t>();
-    int64_t total_n = 0;
-    for (int i = 0; i < input.size(); i++) {
-      id_ranges->push_back(total_n);
-      total_n += sizes[i];
-    }
-  } else {
-    // otherwise, use the given translations
-    id_ranges = translations;
-  }
-
-  // perform preprocessing
-  std::unique_ptr<MetricProcessor<float>> query_metric_processor =
-    create_processor<float>(metric, n, D, k, rowMajorQuery, userStream,
-                            allocator);
-  query_metric_processor->preprocess(search_items);
-
-  std::vector<std::unique_ptr<MetricProcessor<float>>> metric_processors(
-    input.size());
-  for (int i = 0; i < input.size(); i++) {
-    metric_processors[i] = create_processor<float>(
-      metric, sizes[i], D, k, rowMajorQuery, userStream, allocator);
-    metric_processors[i]->preprocess(input[i]);
-  }
-
-  int device;
-  CUDA_CHECK(cudaGetDevice(&device));
-
-  device_buffer<int64_t> trans(allocator, userStream, id_ranges->size());
-  raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(),
-                      userStream);
-
-  device_buffer<float> all_D(allocator, userStream, 0);
-  device_buffer<int64_t> all_I(allocator, userStream, 0);
-
-  float *out_D = res_D;
-  int64_t *out_I = res_I;
-
-  if (input.size() > 1) {
-    all_D.resize(input.size() * k * n, userStream);
-    all_I.resize(input.size() * k * n, userStream);
-
-    out_D = all_D.data();
-    out_I = all_I.data();
-  }
-
-  // Sync user stream only if using other streams to parallelize query
-  if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream));
-
-  for (int i = 0; i < input.size(); i++) {
-    float *out_d_ptr = out_D + (i * k * n);
-    int64_t *out_i_ptr = out_I + (i * k * n);
-
-    cudaStream_t stream =
-      raft::select_stream(userStream, internalStreams, n_int_streams, i);
-
-    switch (metric) {
-      case raft::distance::DistanceType::Haversine:
-
-        ASSERT(D == 2,
-               "Haversine distance requires 2 dimensions "
-               "(latitude / longitude).");
-
-        raft::selection::haversine_knn(out_i_ptr, out_d_ptr, input[i],
-                                       search_items, sizes[i], n, k, stream);
-        break;
-      default:
-        faiss::MetricType m = build_faiss_metric(metric);
-
-        faiss::gpu::StandardGpuResources gpu_res;
-
-        gpu_res.noTempMemory();
-        gpu_res.setDefaultStream(device, stream);
-
-        faiss::gpu::GpuDistanceParams args;
-        args.metric = m;
-        args.metricArg = metricArg;
-        args.k = k;
-        args.dims = D;
-        args.vectors = input[i];
-        args.vectorsRowMajor = rowMajorIndex;
-        args.numVectors = sizes[i];
-        args.queries = search_items;
-        args.queriesRowMajor = rowMajorQuery;
-        args.numQueries = n;
-        args.outDistances = out_d_ptr;
-        args.outIndices = out_i_ptr;
-
-        /**
-         * @todo: Until FAISS supports pluggable allocation strategies,
-         * we will not reap the benefits of the pool allocator for
-         * avoiding device-wide synchronizations from cudaMalloc/cudaFree
-         */
-        bfKnn(&gpu_res, args);
-    }
-  }
-
-  // Sync internal streams if used. We don't need to
-  // sync the user stream because we'll already have
-  // fully serial execution.
-  for (int i = 0; i < n_int_streams; i++) {
-    CUDA_CHECK(cudaStreamSynchronize(internalStreams[i]));
-  }
-
-  if (input.size() > 1 || translations != nullptr) {
-    // This is necessary for proper index translations. If there are
-    // no translations or partitions to combine, it can be skipped.
-    knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream,
-                    trans.data());
-  }
-
-  // Perform necessary post-processing
-  if (metric == raft::distance::DistanceType::L2SqrtExpanded ||
-      metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
-      metric == raft::distance::DistanceType::LpUnexpanded) {
-    /**
-	* post-processing
-	*/
-    float p = 0.5;  // standard l2
-    if (metric == raft::distance::DistanceType::LpUnexpanded)
-      p = 1.0 / metricArg;
-    raft::linalg::unaryOp<float>(
-      res_D, res_D, n * k,
-      [p] __device__(float input) { return powf(input, p); }, userStream);
-  }
-
-  query_metric_processor->revert(search_items);
-  query_metric_processor->postprocess(out_D);
-  for (int i = 0; i < input.size(); i++) {
-    metric_processors[i]->revert(input[i]);
-  }
-
-  if (translations == nullptr) delete id_ranges;
-};
-
 template <typename OutType = float, bool precomp_lbls = false>
 __global__ void class_probs_kernel(OutType *out, const int64_t *knn_indices,
                                    const int *labels, int n_uniq_labels,

diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
@@ -168,7 +168,6 @@ if(BUILD_PRIMS_TESTS)
     prims/gather.cu
     prims/gram.cu
     prims/grid_sync.cu
-    prims/haversine_knn.cu
     prims/hinge.cu
     prims/histogram.cu
     prims/homogeneity_score.cu
@@ -177,7 +176,6 @@ if(BUILD_PRIMS_TESTS)
     prims/kl_divergence.cu
     prims/knn_classify.cu
     prims/knn_regression.cu
-    prims/knn.cu
     prims/kselection.cu
     prims/label.cu
     prims/linearReg.cu