From f090fc0fd46909394145ed333546cf5979abac69 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 10 Dec 2020 16:15:59 -0500 Subject: [PATCH 01/19] add gather & gatherv to raft::comms_t --- cpp/include/raft/comms/comms.hpp | 48 ++++++++++++++++++++++++++++ cpp/include/raft/comms/mpi_comms.hpp | 31 ++++++++++++++++++ cpp/include/raft/comms/std_comms.hpp | 31 ++++++++++++++++++ 3 files changed, 110 insertions(+) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 73e52e781b..0ca9f3972f 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -130,6 +130,15 @@ class comms_iface { const size_t* recvcounts, const size_t* displs, datatype_t datatype, cudaStream_t stream) const = 0; + virtual void gather(const void* sendbuff, void* recvbuff, size_t sendcount, + datatype_t datatype, int root, + cudaStream_t stream) const = 0; + + virtual void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount, + const size_t* recvcounts, const size_t* displs, + datatype_t datatype, int root, + cudaStream_t stream) const = 0; + virtual void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const = 0; @@ -316,6 +325,45 @@ class comms_t { get_type(), stream); } + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, + int root, cudaStream_t stream) const { + impl_->gather(static_cast(sendbuff), + static_cast(recvbuff), sendcount, get_type(), + root, stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @param value_t datatype of underlying buffers + * @param sendbuff buffer containing data to send + * @param recvbuff buffer containing data to receive + * @param sendcount number of elements in send buffer + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place the incoming data from each rank + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gatherv(const value_t* sendbuf, value_t* recvbuf, size_t sendcount, + const size_t* recvcounts, const size_t* displs, int root, + cudaStream_t stream) const { + impl_->gatherv(static_cast(sendbuf), + static_cast(recvbuf), sendcount, recvcounts, displs, + get_type(), root, stream); + } + /** * Reduces data from all ranks then scatters the result across ranks * @tparam value_t datatype of underlying buffers diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index a372702c34..dddc709fa7 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -232,6 +232,37 @@ class mpi_comms : public comms_iface { } } + void gather(const void* sendbuff, void* recvbuff, size_t sendcount, + datatype_t datatype, int root, cudaStream_t stream) const { + size_t dtype_size = get_datatype_size(datatype); + NCCL_TRY(ncclGroupStart()); + if 
(get_rank() == root) { + for (int r = 0; r < get_size(); ++r) { + NCCL_TRY(ncclRecv(recvbuff + sendcount * i * dtype_size, sendcount, + get_nccl_datatype(datatype), r, nccl_comm_, stream)); + } + } + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + nccl_comm_, stream)); + NCCL_TRY(ncclGroupEnd()); + } + + void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount, + const size_t* recvcounts, const size_t* displs, + datatype_t datatype, int root, cudaStream_t stream) const { + size_t dtype_size = get_datatype_size(datatype); + NCCL_TRY(ncclGroupStart()); + if (get_rank() == root) { + for (int r = 0; r < get_size(); ++r) { + NCCL_TRY(ncclRecv(recvbuff + displs[r] * dtype_size, recvcounts[r], + get_nccl_datatype(datatype), r, nccl_comm_, stream)); + } + } + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + nccl_comm_, stream)); + NCCL_TRY(ncclGroupEnd()); + } + void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const { NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index d4b9d2ba39..3cafdc87c0 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -346,6 +346,37 @@ class std_comms : public comms_iface { } } + void gather(const void *sendbuff, void *recvbuff, size_t sendcount, + datatype_t datatype, int root, cudaStream_t stream) const { + size_t dtype_size = get_datatype_size(datatype); + NCCL_TRY(ncclGroupStart()); + if (get_rank() == root) { + for (int r = 0; r < get_size(); ++r) { + NCCL_TRY(ncclRecv(recvbuff + sendcount * i * dtype_size, sendcount, + get_nccl_datatype(datatype), r, nccl_comm_, stream)); + } + } + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + nccl_comm_, stream)); + NCCL_TRY(ncclGroupEnd()); + } + + void gatherv(const void *sendbuf, void *recvbuf, size_t sendcount, + const size_t *recvcounts, const size_t *displs, + datatype_t datatype, int root, cudaStream_t stream) const { + size_t dtype_size = get_datatype_size(datatype); + NCCL_TRY(ncclGroupStart()); + if (get_rank() == root) { + for (int r = 0; r < get_size(); ++r) { + NCCL_TRY(ncclRecv(recvbuff + displs[r] * dtype_size, recvcounts[r], + get_nccl_datatype(datatype), r, nccl_comm_, stream)); + } + } + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + nccl_comm_, stream)); + NCCL_TRY(ncclGroupEnd()); + } + void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const { NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, From 7a46dc9d2280b3a0f899ce2b8c5799f19395464c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 26 Jan 2021 15:58:41 -0500 Subject: [PATCH 02/19] fix build errors --- cpp/include/raft/comms/mpi_comms.hpp | 16 +++++++++------- cpp/include/raft/comms/std_comms.hpp | 16 +++++++++------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index dddc709fa7..8aebcc80cc 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -238,27 +238,29 @@ class mpi_comms : public comms_iface { NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(recvbuff + sendcount * i * dtype_size, sendcount, - get_nccl_datatype(datatype), r, nccl_comm_, 
stream)); + NCCL_TRY(ncclRecv( + static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, + get_nccl_datatype(datatype), r, nccl_comm_, stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount, + void gatherv(const void* sendbuff, void* recvbuff, size_t sendcount, const size_t* recvcounts, const size_t* displs, datatype_t datatype, int root, cudaStream_t stream) const { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(recvbuff + displs[r] * dtype_size, recvcounts[r], - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, + recvcounts[r], get_nccl_datatype(datatype), r, + nccl_comm_, stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 3cafdc87c0..a304955ceb 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -352,27 +352,29 @@ class std_comms : public comms_iface { NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(recvbuff + sendcount * i * dtype_size, sendcount, - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv( + static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, + get_nccl_datatype(datatype), r, nccl_comm_, stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void *sendbuf, void *recvbuf, size_t sendcount, + void gatherv(const void *sendbuff, void *recvbuff, size_t sendcount, const size_t *recvcounts, const size_t *displs, datatype_t datatype, int root, cudaStream_t stream) const { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(recvbuff + displs[r] * dtype_size, recvcounts[r], - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv( + static_cast(recvbuff) + displs[r] * dtype_size, recvcounts[r], + get_nccl_datatype(datatype), r, nccl_comm_, stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } From e6180740181c986b11d5a3557bbab76932805530 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 26 Jan 2021 23:42:42 -0500 Subject: [PATCH 03/19] fix a bug in reducescatter test --- cpp/include/raft/comms/test.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index fa7e471174..627d629e68 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -158,23 +158,23 @@ bool test_collective_allgather(const handle_t &handle, int root) { bool test_collective_reducescatter(const handle_t &handle, int root) { comms_t 
const &communicator = handle.get_comms(); - int const send = 1; + std::vector sends(communicator.get_size(), 1); cudaStream_t stream = handle.get_stream(); raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream, - 1); + sends.size()); raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, 1); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), + CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); From d390789761098d88193cd325e81712173d13d14d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 27 Jan 2021 00:20:38 -0500 Subject: [PATCH 04/19] add python gather & gatherv tests --- cpp/include/raft/comms/test.hpp | 96 ++++++++++++++++++++++++- python/raft/dask/common/__init__.py | 2 + python/raft/dask/common/comms_utils.pyx | 32 +++++++++ python/raft/test/test_comms.py | 4 ++ 4 files changed, 131 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index 627d629e68..5dc6f02d21 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -16,11 +16,13 @@ #pragma once -#include #include #include #include +#include +#include + namespace raft { namespace comms { @@ -155,6 +157,93 @@ bool test_collective_allgather(const handle_t &handle, int root) { return true; } +bool test_collective_gather(const handle_t &handle, int root) { + comms_t const &communicator = handle.get_comms(); + + int const send = communicator.get_rank(); + + cudaStream_t stream = handle.get_stream(); + + raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); + temp_d.resize(1, stream); + + raft::mr::device::buffer recv_d( + handle.get_device_allocator(), stream, + communicator.get_rank() == root ? 
communicator.get_size() : 0); + + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), + cudaMemcpyHostToDevice, stream)); + + communicator.gather(temp_d.data(), recv_d.data(), 1, root, stream); + communicator.sync_stream(stream); + + if (communicator.get_rank() == root) { + std::vector temp_h(communicator.get_size(), 0); + CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), + sizeof(int) * temp_h.size(), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + for (int i = 0; i < communicator.get_size(); i++) { + if (temp_h[i] != i) return false; + } + } + return true; +} + +bool test_collective_gatherv(const handle_t &handle, int root) { + comms_t const &communicator = handle.get_comms(); + + std::vector sendcounts(communicator.get_size()); + std::iota(sendcounts.begin(), sendcounts.end(), size_t{1}); + std::vector displacements(communicator.get_size() + 1, 0); + std::partial_sum(sendcounts.begin(), sendcounts.end(), + displacements.begin() + 1); + + std::vector sends(displacements[communicator.get_rank() + 1] - + displacements[communicator.get_rank()], + communicator.get_rank()); + + cudaStream_t stream = handle.get_stream(); + + raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); + temp_d.resize(sends.size(), stream); + + raft::mr::device::buffer recv_d( + handle.get_device_allocator(), stream, + communicator.get_rank() == root ? displacements.back() : 0); + + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), + sends.size() * sizeof(int), cudaMemcpyHostToDevice, + stream)); + + communicator.gatherv( + temp_d.data(), recv_d.data(), temp_d.size(), + communicator.get_rank() == root ? sendcounts.data() + : static_cast(nullptr), + communicator.get_rank() == root ? displacements.data() + : static_cast(nullptr), + root, stream); + communicator.sync_stream(stream); + + if (communicator.get_rank() == root) { + std::vector temp_h(displacements.back(), 0); + CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), + sizeof(int) * displacements.back(), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + for (int i = 0; i < communicator.get_size(); i++) { + if (std::count_if(temp_h.begin() + displacements[i], + temp_h.begin() + displacements[i + 1], + [i](auto val) { return val != i; }) != 0) { + return false; + } + } + } + return true; +} + bool test_collective_reducescatter(const handle_t &handle, int root) { comms_t const &communicator = handle.get_comms(); @@ -167,8 +256,9 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, 1); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), sends.size() * sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), + sends.size() * sizeof(int), cudaMemcpyHostToDevice, + stream)); communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream); diff --git a/python/raft/dask/common/__init__.py b/python/raft/dask/common/__init__.py index 788af46c92..73bb5d6700 100644 --- a/python/raft/dask/common/__init__.py +++ b/python/raft/dask/common/__init__.py @@ -21,6 +21,8 @@ from .comms_utils import perform_test_comms_allreduce from .comms_utils import perform_test_comms_send_recv from .comms_utils import perform_test_comms_allgather +from .comms_utils import perform_test_comms_gather +from .comms_utils import perform_test_comms_gatherv from .comms_utils import perform_test_comms_bcast from .comms_utils import 
perform_test_comms_reduce from .comms_utils import perform_test_comms_reducescatter diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx index 4dbd2f1a7c..1a703485a9 100644 --- a/python/raft/dask/common/comms_utils.pyx +++ b/python/raft/dask/common/comms_utils.pyx @@ -60,6 +60,8 @@ cdef extern from "raft/comms/test.hpp" namespace "raft::comms": bool test_collective_broadcast(const handle_t &h, int root) except + bool test_collective_reduce(const handle_t &h, int root) except + bool test_collective_allgather(const handle_t &h, int root) except + + bool test_collective_gather(const handle_t &h, int root) except + + bool test_collective_gatherv(const handle_t &h, int root) except + bool test_collective_reducescatter(const handle_t &h, int root) except + bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) except + @@ -131,6 +133,36 @@ def perform_test_comms_allgather(handle, root): return test_collective_allgather(deref(h), root) +def perform_test_comms_gather(handle, root): + """ + Performs a gather on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + root : int + Rank of the root worker + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_gather(deref(h), root) + + +def perform_test_comms_gatherv(handle, root): + """ + Performs a gatherv on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + root : int + Rank of the root worker + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_gatherv(deref(h), root) + + def perform_test_comms_send_recv(handle, n_trials): """ Performs a p2p send/recv on the current worker diff --git a/python/raft/test/test_comms.py b/python/raft/test/test_comms.py index 7dccb7bbae..a0db3b7f4f 100644 --- a/python/raft/test/test_comms.py +++ b/python/raft/test/test_comms.py @@ -28,6 +28,8 @@ from raft.dask.common import perform_test_comms_bcast from raft.dask.common import perform_test_comms_reduce from raft.dask.common import perform_test_comms_allgather + from raft.dask.common import perform_test_comms_gather + from raft.dask.common import perform_test_comms_gatherv from raft.dask.common import perform_test_comms_reducescatter from raft.dask.common import perform_test_comm_split @@ -130,6 +132,8 @@ def _has_handle(sessionId): perform_test_comms_allgather, perform_test_comms_allreduce, perform_test_comms_bcast, + perform_test_comms_gather, + perform_test_comms_gatherv, perform_test_comms_reduce, perform_test_comms_reducescatter, ] From 6a915c421b38cb1556b7fac1b162054b389e1785 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 6 Jan 2021 21:25:44 -0600 Subject: [PATCH 05/19] Add brute force knn --- build.sh | 8 +- cpp/CMakeLists.txt | 19 +- cpp/cmake/Dependencies.cmake | 44 +++- cpp/cmake/faiss_cuda11.patch | 40 +++ cpp/cmake/templates/CMakeLists.txt | 51 ++++ cpp/cmake/templates/Findpkg.cmake.in | 98 ++++++++ .../spatial/knn/detail/brute_force_knn.hpp | 163 ++++++++++++ cpp/include/raft/spatial/knn/detail/utils.hpp | 39 +++ cpp/include/raft/spatial/knn/knn.hpp | 237 ++++++++++++++++++ cpp/test/spatial/knn.cu | 123 +++++++++ 10 files changed, 819 insertions(+), 3 deletions(-) create mode 100644 cpp/cmake/faiss_cuda11.patch create mode 100644 cpp/cmake/templates/CMakeLists.txt create mode 100644 cpp/cmake/templates/Findpkg.cmake.in create mode 100644 cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp create mode 100644 
cpp/include/raft/spatial/knn/detail/utils.hpp create mode 100644 cpp/include/raft/spatial/knn/knn.hpp create mode 100644 cpp/test/spatial/knn.cu diff --git a/build.sh b/build.sh index 213aea9347..b05e002788 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean cppraft pyraft -v -g --allgpuarch --nvtx --show_depr_warn -h --buildgtest" +VALIDARGS="clean cppraft pyraft -v -g --allgpuarch --nvtx --show_depr_warn -h --buildgtest --buildfaiss" HELP="$0 [ ...] [ ...] where is: clean - remove all existing build artifacts and configuration (start over) @@ -29,6 +29,7 @@ HELP="$0 [ ...] [ ...] -v - verbose build mode -g - build for debug --allgpuarch - build for all supported GPU architectures + --buildfaiss - build faiss statically into raft --nvtx - Enable nvtx for profiling support --show_depr_warn - show cmake deprecation warnings -h - print this text @@ -44,6 +45,7 @@ BUILD_DIRS="${CPP_RAFT_BUILD_DIR} ${PY_RAFT_BUILD_DIR} ${PYTHON_DEPS_CLONE}" VERBOSE="" BUILD_ALL_GPU_ARCH=0 BUILD_GTEST=OFF +BUILD_STATIC_FAISS=OFF SINGLEGPU="" NVTX=OFF CLEAN=0 @@ -89,6 +91,9 @@ fi if hasArg --buildgtest; then BUILD_GTEST=ON fi +if hasArg --buildfaiss; then + BUILD_STATIC_FAISS=ON +fi if hasArg --singlegpu; then SINGLEGPU="--singlegpu" fi @@ -140,6 +145,7 @@ if (( ${NUMARGS} == 0 )) || hasArg cppraft; then -DNCCL_PATH=${INSTALL_PREFIX} \ -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \ -DBUILD_GTEST=${BUILD_GTEST} \ + -DBUILD_STATIC_FAISS=${BUILD_STATIC_FAISS} \ .. fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e62b4d11cc..691fbfdfd4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -42,6 +42,8 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) option(BUILD_GTEST "Build the GTEST library for running raft test executables" OFF) +option(BUILD_STATIC_FAISS "Build the FAISS library for nearest neighbors search on GPU" OFF) + option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON) option(EMPTY_MARKER_KERNEL "Enable empty marker kernel after nvtxRangePop" ON) @@ -65,10 +67,22 @@ set(GPU_ARCHS "" CACHE STRING ############################################################################## # - Requirements ------------------------------------------------------------- -find_package(CUDA 10.0 REQUIRED) +# Create FindPackage.cmake files to use find(package) functionality for +# dependencies + +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates) + +if(NOT BUILD_STATIC_FAISS) + GENERATE_FIND_MODULE( + NAME FAISS + HEADER_NAME faiss/IndexFlat.h + LIBRARY_NAME faiss) +endif(NOT BUILD_STATIC_FAISS) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) +find_package(CUDA 10.0 REQUIRED) + ############################################################################## # - Compiler Options -------------------------------------------------------- @@ -196,6 +210,7 @@ set(RAFT_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE STRING set(RAFT_INCLUDE_DIRECTORIES ${RAFT_INCLUDE_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + ${FAISS_INCLUDE_DIRS} ${RMM_INCLUDE_DIRS}) if(NOT CUB_IS_PART_OF_CTK) @@ -211,6 +226,7 @@ endif(DEFINED ENV{CONDA_PREFIX}) # - libraries ---------------------------------------------------------------- set(RAFT_LINK_LIBRARIES + FAISS::FAISS ${CUDA_cublas_LIBRARY} ${CUDA_cusolver_LIBRARY} ${CUDA_CUDART_LIBRARY} @@ -261,6 +277,7 @@ if(BUILD_RAFT_TESTS) test/random/rng.cu test/random/rng_int.cu test/random/sample_without_replacement.cu + test/spatial/knn.cu 
test/stats/mean.cu test/stats/mean_center.cu test/stats/stddev.cu diff --git a/cpp/cmake/Dependencies.cmake b/cpp/cmake/Dependencies.cmake index 64033327d6..c5dd705003 100644 --- a/cpp/cmake/Dependencies.cmake +++ b/cpp/cmake/Dependencies.cmake @@ -30,6 +30,46 @@ if(NOT CUB_IS_PART_OF_CTK) INSTALL_COMMAND "") endif(NOT CUB_IS_PART_OF_CTK) +############################################################################## +# - faiss -------------------------------------------------------------------- + +if(BUILD_STATIC_FAISS) + set(FAISS_DIR ${CMAKE_CURRENT_BINARY_DIR}/faiss CACHE STRING + "Path to FAISS source directory") + ExternalProject_Add(faiss + GIT_REPOSITORY https://github.com/facebookresearch/faiss.git + GIT_TAG a5b850dec6f1cd6c88ab467bfd5e87b0cac2e41d + CONFIGURE_COMMAND LIBS=-pthread + CPPFLAGS=-w + LDFLAGS=-L${CMAKE_INSTALL_PREFIX}/lib + ${CMAKE_CURRENT_BINARY_DIR}/faiss/src/faiss/configure + --prefix=${CMAKE_CURRENT_BINARY_DIR}/faiss + --with-blas=${BLAS_LIBRARIES} + --with-cuda=${CUDA_TOOLKIT_ROOT_DIR} + --with-cuda-arch=${FAISS_GPU_ARCHS} + -v + PREFIX ${FAISS_DIR} + BUILD_COMMAND make -j${PARALLEL_LEVEL} VERBOSE=1 + BUILD_BYPRODUCTS ${FAISS_DIR}/lib/libfaiss.a + BUILD_ALWAYS 1 + INSTALL_COMMAND make -s install > /dev/null + UPDATE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND patch -p1 -N < ${CMAKE_CURRENT_SOURCE_DIR}/cmake/faiss_cuda11.patch || true) + + ExternalProject_Get_Property(faiss install_dir) + add_library(FAISS::FAISS STATIC IMPORTED) + set_property(TARGET FAISS::FAISS PROPERTY + IMPORTED_LOCATION ${FAISS_DIR}/lib/libfaiss.a) + # to account for the FAISS file reorg that happened recently after the current + # pinned commit, just change the following line to + # set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/src/faiss") + set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/src") +else() + set(FAISS_INSTALL_DIR ENV{FAISS_ROOT}) + find_package(FAISS REQUIRED) +endif(BUILD_STATIC_FAISS) + ############################################################################## # - googletest --------------------------------------------------------------- @@ -65,4 +105,6 @@ endif(BUILD_GTEST) if(NOT CUB_IS_PART_OF_CTK) add_dependencies(GTest::GTest cub) -endif(NOT CUB_IS_PART_OF_CTK) +endif(NOT CUB_IS_PART_OF_CTK) +add_dependencies(FAISS::FAISS benchmark) +add_dependencies(FAISS::FAISS faiss) diff --git a/cpp/cmake/faiss_cuda11.patch b/cpp/cmake/faiss_cuda11.patch new file mode 100644 index 0000000000..496ca0e7b2 --- /dev/null +++ b/cpp/cmake/faiss_cuda11.patch @@ -0,0 +1,40 @@ +diff --git a/configure b/configure +index ed40dae..f88ed0a 100755 +--- a/configure ++++ b/configure +@@ -2970,7 +2970,7 @@ ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ex + ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + +- ax_cxx_compile_alternatives="11 0x" ax_cxx_compile_cxx11_required=true ++ ax_cxx_compile_alternatives="14 11 0x" ax_cxx_compile_cxx11_required=true + ac_ext=cpp + ac_cpp='$CXXCPP $CPPFLAGS' + ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +diff --git a/gpu/utils/DeviceDefs.cuh b/gpu/utils/DeviceDefs.cuh +index 89d3dda..bc0f9b5 100644 +--- a/gpu/utils/DeviceDefs.cuh ++++ b/gpu/utils/DeviceDefs.cuh +@@ -13,7 +13,7 @@ + namespace faiss { namespace gpu { + + #ifdef __CUDA_ARCH__ +-#if __CUDA_ARCH__ <= 750 ++#if __CUDA_ARCH__ <= 800 + constexpr int kWarpSize = 32; + #else + #error Unknown __CUDA_ARCH__; please define parameters for compute capability +diff --git a/gpu/utils/MatrixMult-inl.cuh b/gpu/utils/MatrixMult-inl.cuh +index ede225e..4f7eb44 100644 +--- 
a/gpu/utils/MatrixMult-inl.cuh ++++ b/gpu/utils/MatrixMult-inl.cuh +@@ -51,6 +51,9 @@ rawGemm(cublasHandle_t handle, + auto cBT = GetCudaType::Type; + + // Always accumulate in f32 ++# if __CUDACC_VER_MAJOR__ >= 11 ++ cublasSetMathMode(handle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); ++# endif + return cublasSgemmEx(handle, transa, transb, m, n, k, + &fAlpha, A, cAT, lda, + B, cBT, ldb, diff --git a/cpp/cmake/templates/CMakeLists.txt b/cpp/cmake/templates/CMakeLists.txt new file mode 100644 index 0000000000..5bf20c9061 --- /dev/null +++ b/cpp/cmake/templates/CMakeLists.txt @@ -0,0 +1,51 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +# File to generate Find*.cmake modules to find packages. +# This file uses Findpkd.cmake.in to generate modules so that we can use +# find(package) in the main CMake for any package we want. +# +# To use, include in CMakeLists.txt this folder, and then use the function +# GENERATE_FIND_MODULE, which takes the following parameters: +# NAME: +# Name of the package to find in find(NAME) +# HEADER_NAME: +# Name of header file to use to find include dirs path of the package. +# LIBRARY_NAME: +# (Optional) Name of library to find to find include lib path of the package. +# Assumed to be lib${NAME} if left empty. +# LOCATION: +# (Optional) Name of additional folder to look for headers/lib files. +# Useful if one wants to reduce size of #include commands +# VERSION: +# (Optional) Version of the package. Useful to find libraries that append +# version number to their filename (for example libopenblas.so.0) + + +function(GENERATE_FIND_MODULE) + set(oneValueArgs NAME LOCATION VERSION) + set(multiValueArgs HEADER_NAME LIBRARY_NAME) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + if(NOT DEFINED PKG_LIBRARY_NAME) + set(PKG_LIBRARY_NAME ${PKG_NAME}) + endif(NOT DEFINED PKG_LIBRARY_NAME) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/Findpkg.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/cmake/Find${PKG_NAME}.cmake + @ONLY) +endfunction() + diff --git a/cpp/cmake/templates/Findpkg.cmake.in b/cpp/cmake/templates/Findpkg.cmake.in new file mode 100644 index 0000000000..a81f167d62 --- /dev/null +++ b/cpp/cmake/templates/Findpkg.cmake.in @@ -0,0 +1,98 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Based on FindPNG.cmake from cmake 3.14.3 + +#[=======================================================================[.rst: +Find@PKG_NAME@ +-------- + +Template to generate FindPKG_NAME.cmake CMake modules + +Find @PKG_NAME@ + +Imported targets +^^^^^^^^^^^^^^^^ + +This module defines the following :prop_tgt:`IMPORTED` target: + +``@PKG_NAME@::@PKG_NAME@`` + The lib@PKG_NAME@ library, if found. + +Result variables +^^^^^^^^^^^^^^^^ + +This module will set the following variables in your project: + +``@PKG_NAME@_INCLUDE_DIRS`` + where to find @PKG_NAME@.hpp , etc. +``@PKG_NAME@_LIBRARIES`` + the libraries to link against to use lib@PKG_NAME@. +``@PKG_NAME@_FOUND`` + If false, do not try to use @PKG_NAME@. +``@PKG_NAME@_VERSION_STRING`` + the version of the @PKG_NAME@ library found + +#]=======================================================================] + +find_path(@PKG_NAME@_LOCATION @PKG_HEADER_NAME@ + HINTS ${@PKG_NAME@_INSTALL_DIR} + PATH_SUFFIXES include include/@PKG_LOCATION@) + +list(APPEND @PKG_NAME@_NAMES @PKG_LIBRARY_NAME@ lib@PKG_LIBRARY_NAME@) +set(_@PKG_NAME@_VERSION_SUFFIXES @PKG_VERSION@) + +foreach(v IN LISTS _@PKG_NAME@_VERSION_SUFFIXES) + list(APPEND @PKG_NAME@_NAMES @PKG_LIBRARY_NAME@${v} lib@PKG_LIBRARY_NAME@${v}) + list(APPEND @PKG_NAME@_NAMES @PKG_LIBRARY_NAME@.${v} lib@PKG_LIBRARY_NAME@.${v}) +endforeach() +unset(_@PKG_NAME@_VERSION_SUFFIXES) + +find_library(@PKG_NAME@_LIBRARY_RELEASE NAMES ${@PKG_NAME@_NAMES} + HINTS ${@PKG_NAME@_INSTALL_DIR} + PATH_SUFFIXES lib) + +include(${CMAKE_ROOT}/Modules/SelectLibraryConfigurations.cmake) +select_library_configurations(@PKG_NAME@) +mark_as_advanced(@PKG_NAME@_LIBRARY_RELEASE) +unset(@PKG_NAME@_NAMES) + +# Set by select_library_configurations(), but we want the one from +# find_package_handle_standard_args() below. 
+unset(@PKG_NAME@_FOUND) + +if (@PKG_NAME@_LIBRARY AND @PKG_NAME@_LOCATION) + set(@PKG_NAME@_INCLUDE_DIRS ${@PKG_NAME@_LOCATION} ) + set(@PKG_NAME@_LIBRARY ${@PKG_NAME@_LIBRARY}) + + if(NOT TARGET @PKG_NAME@::@PKG_NAME@) + add_library(@PKG_NAME@::@PKG_NAME@ UNKNOWN IMPORTED) + set_target_properties(@PKG_NAME@::@PKG_NAME@ PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${@PKG_NAME@_INCLUDE_DIRS}") + if(EXISTS "${@PKG_NAME@_LIBRARY}") + set_target_properties(@PKG_NAME@::@PKG_NAME@ PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" + IMPORTED_LOCATION "${@PKG_NAME@_LIBRARY}") + endif() + endif() +endif () + + +include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) +find_package_handle_standard_args(@PKG_NAME@ + REQUIRED_VARS @PKG_NAME@_LIBRARY @PKG_NAME@_LOCATION + VERSION_VAR @PKG_NAME@_VERSION_STRING) + +mark_as_advanced(@PKG_NAME@_LOCATION @PKG_NAME@_LIBRARY) diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp new file mode 100644 index 0000000000..16b295b3ca --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -0,0 +1,163 @@ +#pragma once + +#include "utils.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace raft { +namespace knn { +namespace detail { + +template +__global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, + value_t *outK, value_idx *outV, + size_t n_samples, int n_parts, + value_t initK, value_idx initV, int k, + value_idx *translations) { + constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; + + __shared__ value_t smemK[kNumWarps * warp_q]; + __shared__ value_idx smemV[kNumWarps * warp_q]; + + /** + * Uses shared memory + */ + faiss::gpu::BlockSelect, warp_q, thread_q, + tpb> + heap(initK, initV, smemK, smemV, k); + + // Grid is exactly sized to rows available + int row = blockIdx.x; + int total_k = k * n_parts; + + int i = threadIdx.x; + + // Get starting pointers for cols in current thread + int part = i / k; + size_t row_idx = (row * k) + (part * n_samples * k); + + int col = i % k; + + value_t *inKStart = inK + (row_idx + col); + value_idx *inVStart = inV + (row_idx + col); + + int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize); + value_idx translation = 0; + + for (; i < limit; i += tpb) { + translation = translations[part]; + heap.add(*inKStart, (*inVStart) + translation); + + part = (i + tpb) / k; + row_idx = (row * k) + (part * n_samples * k); + + col = (i + tpb) % k; + + inKStart = inK + (row_idx + col); + inVStart = inV + (row_idx + col); + } + + // Handle last remainder fraction of a warp of elements + if (i < total_k) { + translation = translations[part]; + heap.addThreadQ(*inKStart, (*inVStart) + translation); + } + + heap.reduce(); + + for (int i = threadIdx.x; i < k; i += tpb) { + outK[row * k + i] = smemK[i]; + outV[row * k + i] = smemV[i]; + } +} + +template +inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK, + value_idx *outV, size_t n_samples, int n_parts, + int k, cudaStream_t stream, + value_idx *translations) { + auto grid = dim3(n_samples); + + constexpr int n_threads = (warp_q <= 1024) ? 
128 : 64; + auto block = dim3(n_threads); + + auto kInit = faiss::gpu::Limits::getMax(); + auto vInit = -1; + knn_merge_parts_kernel + <<>>(inK, inV, outK, outV, n_samples, n_parts, + kInit, vInit, k, translations); + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief Merge knn distances and index matrix, which have been partitioned + * by row, into a single matrix with only the k-nearest neighbors. + * + * @param inK partitioned knn distance matrix + * @param inV partitioned knn index matrix + * @param outK merged knn distance matrix + * @param outV merged knn index matrix + * @param n_samples number of samples per partition + * @param n_parts number of partitions + * @param k number of neighbors per partition (also number of merged neighbors) + * @param stream CUDA stream to use + * @param translations mapping of index offsets for each partition + */ +template +inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, + value_idx *outV, size_t n_samples, int n_parts, + int k, cudaStream_t stream, + value_idx *translations) { + if (k == 1) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); + else if (k <= 32) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); + else if (k <= 64) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); + else if (k <= 128) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); + else if (k <= 256) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); + else if (k <= 512) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); + else if (k <= 1024) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); +} + +inline faiss::MetricType build_faiss_metric(MetricType metric) { + switch (metric) { + case MetricType::METRIC_Cosine: + return faiss::MetricType::METRIC_INNER_PRODUCT; + case MetricType::METRIC_Correlation: + return faiss::MetricType::METRIC_INNER_PRODUCT; + default: + return (faiss::MetricType)metric; + } +} + +} // namespace detail +} // namespace knn +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/utils.hpp b/cpp/include/raft/spatial/knn/detail/utils.hpp new file mode 100644 index 0000000000..c8542772f7 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/utils.hpp @@ -0,0 +1,39 @@ + +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace raft { +namespace knn { + + enum MetricType { + METRIC_INNER_PRODUCT = 0, + METRIC_L2, + METRIC_L1, + METRIC_Linf, + METRIC_Lp, + + METRIC_Canberra = 20, + METRIC_BrayCurtis, + METRIC_JensenShannon, + + METRIC_Cosine = 100, + METRIC_Correlation + }; + +} // namespace knn +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp new file mode 100644 index 0000000000..b37d99f82a --- /dev/null +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/brute_force_knn.hpp" + +#include +#include + +namespace raft { + namespace knn { + +/** + * Search the kNN for the k-nearest neighbors of a set of query vectors + * @param[in] input vector of device device memory array pointers to search + * @param[in] sizes vector of memory sizes for each device array pointer in input + * @param[in] D number of cols in input and search_items + * @param[in] search_items set of vectors to query for neighbors + * @param[in] n number of items in search_items + * @param[out] res_I pointer to device memory for returning k nearest indices + * @param[out] res_D pointer to device memory for returning k nearest distances + * @param[in] k number of neighbors to query + * @param[in] allocator the device memory allocator to use for temporary scratch memory + * @param[in] userStream the main cuda stream to use + * @param[in] internalStreams optional when n_params > 0, the index partitions can be + * queried in parallel using these streams. Note that n_int_streams also + * has to be > 0 for these to be used and their cardinality does not need + * to correspond to n_parts. + * @param[in] n_int_streams size of internalStreams. When this is <= 0, only the + * user stream will be used. + * @param[in] rowMajorIndex are the index arrays in row-major layout? + * @param[in] rowMajorQuery are the query array in row-major layout? + * @param[in] translations translation ids for indices when index rows represent + * non-contiguous partitions + * @param[in] metric corresponds to the FAISS::metricType enum (default is euclidean) + * @param[in] metricArg metric argument to use. 
Corresponds to the p arg for lp norm + * @param[in] expanded_form whether or not lp variants should be reduced w/ lp-root + */ +template +void brute_force_knn_impl(std::vector &input, std::vector &sizes, + IntType D, float *search_items, IntType n, int64_t *res_I, + float *res_D, IntType k, + std::shared_ptr allocator, + cudaStream_t userStream, + cudaStream_t *internalStreams = nullptr, + int n_int_streams = 0, bool rowMajorIndex = true, + bool rowMajorQuery = true, + std::vector *translations = nullptr, + MetricType metric = MetricType::METRIC_L2, + float metricArg = 2.0, bool expanded_form = false) { + + ASSERT(input.size() == sizes.size(), + "input and sizes vectors should be the same size"); + + faiss::MetricType m = detail::build_faiss_metric(metric); + + std::vector *id_ranges; + if (translations == nullptr) { + // If we don't have explicit translations + // for offsets of the indices, build them + // from the local partitions + id_ranges = new std::vector(); + int64_t total_n = 0; + for (int i = 0; i < input.size(); i++) { + id_ranges->push_back(total_n); + total_n += sizes[i]; + } + } else { + // otherwise, use the given translations + id_ranges = translations; + } + + // perform preprocessing + std::unique_ptr> query_metric_processor = + create_processor(metric, n, D, k, rowMajorQuery, userStream, + allocator); + query_metric_processor->preprocess(search_items); + + std::vector>> metric_processors( + input.size()); + for (int i = 0; i < input.size(); i++) { + metric_processors[i] = create_processor( + metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); + metric_processors[i]->preprocess(input[i]); + } + + int device; + CUDA_CHECK(cudaGetDevice(&device)); + + raft::mr::device::buffer trans(allocator, userStream, id_ranges->size()); + raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), + userStream); + + raft::mr::device::buffer all_D(allocator, userStream, 0); + raft::mr::devic::buffer all_I(allocator, userStream, 0); + + float *out_D = res_D; + int64_t *out_I = res_I; + + if (input.size() > 1) { + all_D.resize(input.size() * k * n, userStream); + all_I.resize(input.size() * k * n, userStream); + + out_D = all_D.data(); + out_I = all_I.data(); + } + + // Sync user stream only if using other streams to parallelize query + if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); + + for (int i = 0; i < input.size(); i++) { + faiss::gpu::StandardGpuResources gpu_res; + + cudaStream_t stream = + raft::select_stream(userStream, internalStreams, n_int_streams, i); + + gpu_res.noTempMemory(); + gpu_res.setCudaMallocWarning(false); + gpu_res.setDefaultStream(device, stream); + + faiss::gpu::GpuDistanceParams args; + args.metric = m; + args.metricArg = metricArg; + args.k = k; + args.dims = D; + args.vectors = input[i]; + args.vectorsRowMajor = rowMajorIndex; + args.numVectors = sizes[i]; + args.queries = search_items; + args.queriesRowMajor = rowMajorQuery; + args.numQueries = n; + args.outDistances = out_D + (i * k * n); + args.outIndices = out_I + (i * k * n); + + /** + * @todo: Until FAISS supports pluggable allocation strategies, + * we will not reap the benefits of the pool allocator for + * avoiding device-wide synchronizations from cudaMalloc/cudaFree + */ + bfKnn(&gpu_res, args); + + CUDA_CHECK(cudaPeekAtLastError()); + } + + // Sync internal streams if used. We don't need to + // sync the user stream because we'll already have + // fully serial execution. 
+ for (int i = 0; i < n_int_streams; i++) { + CUDA_CHECK(cudaStreamSynchronize(internalStreams[i])); + } + + if (input.size() > 1 || translations != nullptr) { + // This is necessary for proper index translations. If there are + // no translations or partitions to combine, it can be skipped. + knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, + trans.data()); + } + + // Perform necessary post-processing + if ((m == faiss::MetricType::METRIC_L2 || + m == faiss::MetricType::METRIC_Lp) && + !expanded_form) { + /** + * post-processing + */ + float p = 0.5; // standard l2 + if (m == faiss::MetricType::METRIC_Lp) p = 1.0 / metricArg; + raft::linalg::unaryOp( + res_D, res_D, n * k, + [p] __device__(float input) { return powf(input, p); }, userStream); + } + + query_metric_processor->revert(search_items); + query_metric_processor->postprocess(out_D); + for (int i = 0; i < input.size(); i++) { + metric_processors[i]->revert(input[i]); + } + + if (translations == nullptr) delete id_ranges; +} + +/** + * @brief Flat C++ API function to perform a brute force knn on + * a series of input arrays and combine the results into a single + * output array for indexes and distances. + * + * @param[in] handle the cuml handle to use + * @param[in] input vector of pointers to the input arrays + * @param[in] sizes vector of sizes of input arrays + * @param[in] D the dimensionality of the arrays + * @param[in] search_items array of items to search of dimensionality D + * @param[in] n number of rows in search_items + * @param[out] res_I the resulting index array of size n * k + * @param[out] res_D the resulting distance array of size n * k + * @param[in] k the number of nearest neighbors to return + * @param[in] rowMajorIndex are the index arrays in row-major order? + * @param[in] rowMajorQuery are the query arrays in row-major order? + * @param[in] metric distance metric to use. Euclidean (L2) is used by + * default + * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. This + * is ignored if the metric_type is not Minkowski. + * @param[in] expanded should lp-based distances be returned in their expanded + * form (e.g., without raising to the 1/p power). + */ +void brute_force_knn(raft::handle_t &handle, std::vector &input, + std::vector &sizes, int D, float *search_items, int n, + int64_t *res_I, float *res_D, int k, bool rowMajorIndex, + bool rowMajorQuery, MetricType metric, float metric_arg, + bool expanded) { + ASSERT(input.size() == sizes.size(), + "input and sizes vectors must be the same size"); + + std::vector int_streams = handle.get_internal_streams(); + + brute_force_knn_impl( + input, sizes, D, search_items, n, res_I, res_D, k, + handle.get_device_allocator(), handle.get_stream(), int_streams.data(), + handle.get_num_internal_streams(), rowMajorIndex, rowMajorQuery, nullptr, + metric, metric_arg, expanded); +} + +} // namespace knn +} // namespace raft diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu new file mode 100644 index 0000000000..85780cfcf4 --- /dev/null +++ b/cpp/test/spatial/knn.cu @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +namespace raft { + + struct KNNInputs { + std::vector> input; + int k; + }; + +template +class KNNTest : public ::testing::TestWithParam { + protected: + void testBruteForce() { + + brute_force_knn(handle_, + input_, + sizes_, + cols_, + search_data_, + rows_, + indices_, + distances_, + k_, + true, + true); + } + + void SetUp() override { + params = ::testing::TestWithParam::GetParam(); + rows_ = params.input.size(); + cols_ = params.input[0].size(); + k_ = params.k; + + float *input_d = rmm::device_buffer(params.input.data(), + params.input.size() * sizeof(float)); + + input_.push_back(input_d); + sizes_.push_back(rows_); + + raft::allocate(search_data_, row_ * cols_, true); + raft::allocate(indices_, + rows_ * cols_, + true); + raft::allocate(distances_, + rows_ * cols_, + true); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(search_data)); + CUDA_CHECK(cudaFree(search_labels)); + CUDA_CHECK(cudaFree(output_dists)); + CUDA_CHECK(cudaFree(output_indices)); + CUDA_CHECK(cudaFree(actual_labels)); + CUDA_CHECK(cudaFree(expected_labels)); + } + + private: + raft::handle_t handle_; + KNNInputs params_; + int rows_; + int cols_; + std::vector input_; + std::vector sizes_; + float *search_data_; + int64_t indices_; + float* distances_; + int k_; +}; + + +const std::vector inputs = { + // 2D + { + { + { 7.89611 , -6.3093657 }, + { 8.198494 , -6.6102095 }, + {-1.067701 , 0.2757877 }, + { 5.5629272, -4.0279684 }, + { 8.466168 , -6.3818727 }, + { 7.373038 , -3.2476108 }, + { 7.3618903, -6.311329 }, + { 3.5585778, 2.3175476 }, + { 8.722544 , -6.184722 }, + { 5.9165254, -4.0085735 }, + {-2.4502695, 1.8806121 }, + { 1.250205 , 1.6940732 }, + { 7.702861 , -5.5382366 }, + {-0.32521492, 1.0503006 }, + { 7.203165 , -6.1078873 }, + { 0.7067232, -0.02844107}, + {-0.6195269, 1.6659582 }, + { 7.3585844, -6.5425425 }, + { 0.2946735, 0.7920021 }, + { 5.9978905, -4.235259 }}, + 2}, +}; + +typedef KNNTest KNNTestF; +TEST_P(KNNTestF, BruteForce) { this->testBruteForce(); } + +INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestF, ::testing::ValuesIn(inputs)); + +} // namespace raft From c775437d3efb597bfc1c47034a914d2c59562f4b Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Thu, 7 Jan 2021 11:43:08 -0600 Subject: [PATCH 06/19] Fix compile errors --- .../spatial/knn/detail/brute_force_knn.hpp | 1 + .../raft/spatial/knn/detail/processing.hpp | 191 ++++++++++++++++++ cpp/include/raft/spatial/knn/knn.hpp | 7 +- 3 files changed, 197 insertions(+), 2 deletions(-) create mode 100644 cpp/include/raft/spatial/knn/detail/processing.hpp diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index 16b295b3ca..29eccfd43b 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp new file mode 100644 index 0000000000..57c7e1050c 
--- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -0,0 +1,191 @@ +/* + * Copyright (c)2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "utils.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace knn { + +using deviceAllocator = raft::mr::device::allocator; +/** + * @brief A virtual class defining pre- and post-processing + * for metrics. This class will temporarily modify its given + * state in `preprocess()` and undo those modifications in + * `postprocess()` + */ + +template +class MetricProcessor { + public: + virtual void preprocess(math_t *data) {} + + virtual void revert(math_t *data) {} + + virtual void postprocess(math_t *data) {} + + virtual ~MetricProcessor() = default; +}; + +template +class CosineMetricProcessor : public MetricProcessor { + protected: + int k_; + bool row_major_; + size_t n_rows_; + size_t n_cols_; + cudaStream_t stream_; + std::shared_ptr device_allocator_; + raft::mr::device::buffer colsums_; + + public: + CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, + cudaStream_t stream, + std::shared_ptr allocator) + : device_allocator_(allocator), + stream_(stream), + colsums_(allocator, stream, n_rows), + n_cols_(n_cols), + n_rows_(n_rows), + row_major_(row_major), + k_(k) {} + + void preprocess(math_t *data) { + raft::linalg::rowNorm(colsums_.data(), data, n_cols_, n_rows_, + raft::linalg::NormType::L2Norm, row_major_, stream_, + [] __device__(math_t in) { return sqrtf(in); }); + + raft::linalg::matrixVectorOp( + data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, + [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; }, + stream_); + } + + void revert(math_t *data) { + raft::linalg::matrixVectorOp( + data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, + [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; }, + stream_); + } + + void postprocess(math_t *data) { + raft::linalg::unaryOp( + data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, + stream_); + } + + ~CosineMetricProcessor() = default; +}; + +template +class CorrelationMetricProcessor : public CosineMetricProcessor { + using cosine = CosineMetricProcessor; + + public: + CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k, + bool row_major, cudaStream_t stream, + std::shared_ptr allocator) + : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream, + allocator), + means_(allocator, stream, n_rows) {} + + void preprocess(math_t *data) { + math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_; + + raft::linalg::reduce(means_.data(), data, cosine::n_cols_, cosine::n_rows_, + (math_t)0.0, cosine::row_major_, true, + cosine::stream_); + + raft::linalg::unaryOp( + means_.data(), means_.data(), cosine::n_rows_, + [=] __device__(math_t in) { return in * normalizer_const; }, + cosine::stream_); + + raft::stats::meanCenter(data, data, means_.data(), 
cosine::n_cols_, + cosine::n_rows_, cosine::row_major_, false, + cosine::stream_); + + CosineMetricProcessor::preprocess(data); + } + + void revert(math_t *data) { + CosineMetricProcessor::revert(data); + + raft::stats::meanAdd(data, data, means_.data(), cosine::n_cols_, + cosine::n_rows_, cosine::row_major_, false, + cosine::stream_); + } + + void postprocess(math_t *data) { + CosineMetricProcessor::postprocess(data); + } + + ~CorrelationMetricProcessor() = default; + + raft::mr::device::buffer means_; +}; + +template +class DefaultMetricProcessor : public MetricProcessor { + public: + void preprocess(math_t *data) {} + + void revert(math_t *data) {} + + void postprocess(math_t *data) {} + + ~DefaultMetricProcessor() = default; +}; + +template +inline std::unique_ptr> create_processor( + MetricType metric, int n, int D, int k, bool rowMajorQuery, + cudaStream_t userStream, std::shared_ptr allocator) { + MetricProcessor *mp = nullptr; + + switch (metric) { + case MetricType::METRIC_Cosine: + mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, + allocator); + break; + + case MetricType::METRIC_Correlation: + mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, + userStream, allocator); + break; + default: + mp = new DefaultMetricProcessor(); + } + + return std::unique_ptr>(mp); +} + +// Currently only being used by floats +template class MetricProcessor; +template class CosineMetricProcessor; +template class CorrelationMetricProcessor; +template class DefaultMetricProcessor; + +}; // namespace knn +}; // namespace raft diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index b37d99f82a..9abafe6681 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -17,6 +17,7 @@ #pragma once #include "detail/brute_force_knn.hpp" +#include "detail/processing.hpp" #include #include @@ -24,6 +25,8 @@ namespace raft { namespace knn { +using deviceAllocator = raft::mr::device::allocator; + /** * Search the kNN for the k-nearest neighbors of a set of query vectors * @param[in] input vector of device device memory array pointers to search @@ -106,7 +109,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, userStream); raft::mr::device::buffer all_D(allocator, userStream, 0); - raft::mr::devic::buffer all_I(allocator, userStream, 0); + raft::mr::device::buffer all_I(allocator, userStream, 0); float *out_D = res_D; int64_t *out_I = res_I; @@ -166,7 +169,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, if (input.size() > 1 || translations != nullptr) { // This is necessary for proper index translations. If there are // no translations or partitions to combine, it can be skipped. 
- knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, + detail::knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, trans.data()); } From a22fddb1573ea3189e9c0823dde08f95f43ebbe7 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Fri, 8 Jan 2021 19:06:39 -0600 Subject: [PATCH 07/19] Add knn test --- cpp/CMakeLists.txt | 78 ++++---- .../spatial/knn/detail/brute_force_knn.hpp | 170 +++++++++++++++++ cpp/include/raft/spatial/knn/knn.hpp | 172 +----------------- cpp/test/spatial/knn.cu | 171 +++++++++++------ 4 files changed, 329 insertions(+), 262 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 691fbfdfd4..82d7c3d2f0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -248,45 +248,45 @@ if(BUILD_RAFT_TESTS) # keep the files in alphabetical order! add_executable(test_raft - test/cudart_utils.cpp - test/handle.cpp - test/integer_utils.cpp - test/lap/lap.cu - test/linalg/add.cu - test/linalg/binary_op.cu - test/linalg/cholesky_r1.cu - test/linalg/coalesced_reduction.cu - test/linalg/divide.cu - test/linalg/eig.cu - test/linalg/eig_sel.cu - test/linalg/gemm_layout.cu - test/linalg/map_then_reduce.cu - test/linalg/matrix_vector_op.cu - test/linalg/multiply.cu - test/linalg/norm.cu - test/linalg/reduce.cu - test/linalg/strided_reduction.cu - test/linalg/subtract.cu - test/linalg/svd.cu - test/linalg/transpose.cu - test/linalg/unary_op.cu - test/matrix/math.cu - test/matrix/matrix.cu - test/mr/device/buffer.cpp - test/mr/host/buffer.cpp - test/random/rng.cu - test/random/rng_int.cu - test/random/sample_without_replacement.cu - test/spatial/knn.cu - test/stats/mean.cu - test/stats/mean_center.cu - test/stats/stddev.cu - test/stats/sum.cu - test/test.cpp - test/spectral_matrix.cu - test/eigen_solvers.cu - test/cluster_solvers.cu - test/mst.cu) + # test/cudart_utils.cpp + #test/handle.cpp + #test/integer_utils.cpp + #test/lap/lap.cu + #test/linalg/add.cu + #test/linalg/binary_op.cu + #test/linalg/cholesky_r1.cu + #test/linalg/coalesced_reduction.cu + #test/linalg/divide.cu + #test/linalg/eig.cu + #test/linalg/eig_sel.cu + #test/linalg/gemm_layout.cu + #test/linalg/map_then_reduce.cu + #test/linalg/matrix_vector_op.cu + #test/linalg/multiply.cu + #test/linalg/norm.cu + #test/linalg/reduce.cu + #test/linalg/strided_reduction.cu + #test/linalg/subtract.cu + #test/linalg/svd.cu + #test/linalg/transpose.cu + #test/linalg/unary_op.cu + #test/matrix/math.cu + #test/matrix/matrix.cu + #test/mr/device/buffer.cpp + #test/mr/host/buffer.cpp + #test/random/rng.cu + #test/random/rng_int.cu + #test/random/sample_without_replacement.cu + test/spatial/knn.cu) + #test/stats/mean.cu + #test/stats/mean_center.cu + #test/stats/stddev.cu + #test/stats/sum.cu + #test/test.cpp + #test/spectral_matrix.cu + #test/eigen_solvers.cu + #test/cluster_solvers.cu + #test/mst.cu) target_include_directories(test_raft PRIVATE diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index 29eccfd43b..432c3b2cf5 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -1,5 +1,6 @@ #pragma once +#include "processing.hpp" #include "utils.hpp" #include @@ -159,6 +160,175 @@ inline faiss::MetricType build_faiss_metric(MetricType metric) { } } +/** + * Search the kNN for the k-nearest neighbors of a set of query vectors + * @param[in] input vector of device device memory array pointers to search + * @param[in] sizes 
vector of memory sizes for each device array pointer in input + * @param[in] D number of cols in input and search_items + * @param[in] search_items set of vectors to query for neighbors + * @param[in] n number of items in search_items + * @param[out] res_I pointer to device memory for returning k nearest indices + * @param[out] res_D pointer to device memory for returning k nearest distances + * @param[in] k number of neighbors to query + * @param[in] allocator the device memory allocator to use for temporary scratch memory + * @param[in] userStream the main cuda stream to use + * @param[in] internalStreams optional when n_params > 0, the index partitions can be + * queried in parallel using these streams. Note that n_int_streams also + * has to be > 0 for these to be used and their cardinality does not need + * to correspond to n_parts. + * @param[in] n_int_streams size of internalStreams. When this is <= 0, only the + * user stream will be used. + * @param[in] rowMajorIndex are the index arrays in row-major layout? + * @param[in] rowMajorQuery are the query array in row-major layout? + * @param[in] translations translation ids for indices when index rows represent + * non-contiguous partitions + * @param[in] metric corresponds to the FAISS::metricType enum (default is euclidean) + * @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm + * @param[in] expanded_form whether or not lp variants should be reduced w/ lp-root + */ +template +void brute_force_knn_impl(std::vector &input, std::vector &sizes, + IntType D, float *search_items, IntType n, int64_t *res_I, + float *res_D, IntType k, + std::shared_ptr allocator, + cudaStream_t userStream, + cudaStream_t *internalStreams = nullptr, + int n_int_streams = 0, bool rowMajorIndex = true, + bool rowMajorQuery = true, + std::vector *translations = nullptr, + MetricType metric = MetricType::METRIC_L2, + float metricArg = 2.0, bool expanded_form = false) { + + ASSERT(input.size() == sizes.size(), + "input and sizes vectors should be the same size"); + + faiss::MetricType m = detail::build_faiss_metric(metric); + + std::vector *id_ranges; + if (translations == nullptr) { + // If we don't have explicit translations + // for offsets of the indices, build them + // from the local partitions + id_ranges = new std::vector(); + int64_t total_n = 0; + for (int i = 0; i < input.size(); i++) { + id_ranges->push_back(total_n); + total_n += sizes[i]; + } + } else { + // otherwise, use the given translations + id_ranges = translations; + } + + // perform preprocessing + std::unique_ptr> query_metric_processor = + create_processor(metric, n, D, k, rowMajorQuery, userStream, + allocator); + query_metric_processor->preprocess(search_items); + + std::vector>> metric_processors( + input.size()); + for (int i = 0; i < input.size(); i++) { + metric_processors[i] = create_processor( + metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); + metric_processors[i]->preprocess(input[i]); + } + + int device; + CUDA_CHECK(cudaGetDevice(&device)); + + raft::mr::device::buffer trans(allocator, userStream, id_ranges->size()); + raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), + userStream); + + raft::mr::device::buffer all_D(allocator, userStream, 0); + raft::mr::device::buffer all_I(allocator, userStream, 0); + + float *out_D = res_D; + int64_t *out_I = res_I; + + if (input.size() > 1) { + all_D.resize(input.size() * k * n, userStream); + all_I.resize(input.size() * k * n, userStream); + + out_D = all_D.data(); + 
out_I = all_I.data(); + } + + // Sync user stream only if using other streams to parallelize query + if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); + + for (int i = 0; i < input.size(); i++) { + faiss::gpu::StandardGpuResources gpu_res; + + cudaStream_t stream = + raft::select_stream(userStream, internalStreams, n_int_streams, i); + + gpu_res.noTempMemory(); + gpu_res.setCudaMallocWarning(false); + gpu_res.setDefaultStream(device, stream); + + faiss::gpu::GpuDistanceParams args; + args.metric = m; + args.metricArg = metricArg; + args.k = k; + args.dims = D; + args.vectors = input[i]; + args.vectorsRowMajor = rowMajorIndex; + args.numVectors = sizes[i]; + args.queries = search_items; + args.queriesRowMajor = rowMajorQuery; + args.numQueries = n; + args.outDistances = out_D + (i * k * n); + args.outIndices = out_I + (i * k * n); + + /** + * @todo: Until FAISS supports pluggable allocation strategies, + * we will not reap the benefits of the pool allocator for + * avoiding device-wide synchronizations from cudaMalloc/cudaFree + */ + bfKnn(&gpu_res, args); + + CUDA_CHECK(cudaPeekAtLastError()); + } + + // Sync internal streams if used. We don't need to + // sync the user stream because we'll already have + // fully serial execution. + for (int i = 0; i < n_int_streams; i++) { + CUDA_CHECK(cudaStreamSynchronize(internalStreams[i])); + } + + if (input.size() > 1 || translations != nullptr) { + // This is necessary for proper index translations. If there are + // no translations or partitions to combine, it can be skipped. + detail::knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, + trans.data()); + } + + // Perform necessary post-processing + if ((m == faiss::MetricType::METRIC_L2 || + m == faiss::MetricType::METRIC_Lp) && + !expanded_form) { + /** + * post-processing + */ + float p = 0.5; // standard l2 + if (m == faiss::MetricType::METRIC_Lp) p = 1.0 / metricArg; + raft::linalg::unaryOp( + res_D, res_D, n * k, + [p] __device__(float input) { return powf(input, p); }, userStream); + } + + query_metric_processor->revert(search_items); + query_metric_processor->postprocess(out_D); + for (int i = 0; i < input.size(); i++) { + metric_processors[i]->revert(input[i]); + } + + if (translations == nullptr) delete id_ranges; +} + } // namespace detail } // namespace knn } // namespace raft diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index 9abafe6681..b6e3ed09ab 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -17,7 +17,6 @@ #pragma once #include "detail/brute_force_knn.hpp" -#include "detail/processing.hpp" #include #include @@ -27,175 +26,6 @@ namespace raft { using deviceAllocator = raft::mr::device::allocator; -/** - * Search the kNN for the k-nearest neighbors of a set of query vectors - * @param[in] input vector of device device memory array pointers to search - * @param[in] sizes vector of memory sizes for each device array pointer in input - * @param[in] D number of cols in input and search_items - * @param[in] search_items set of vectors to query for neighbors - * @param[in] n number of items in search_items - * @param[out] res_I pointer to device memory for returning k nearest indices - * @param[out] res_D pointer to device memory for returning k nearest distances - * @param[in] k number of neighbors to query - * @param[in] allocator the device memory allocator to use for temporary scratch memory - * @param[in] userStream the main cuda stream to use - * 
@param[in] internalStreams optional when n_params > 0, the index partitions can be - * queried in parallel using these streams. Note that n_int_streams also - * has to be > 0 for these to be used and their cardinality does not need - * to correspond to n_parts. - * @param[in] n_int_streams size of internalStreams. When this is <= 0, only the - * user stream will be used. - * @param[in] rowMajorIndex are the index arrays in row-major layout? - * @param[in] rowMajorQuery are the query array in row-major layout? - * @param[in] translations translation ids for indices when index rows represent - * non-contiguous partitions - * @param[in] metric corresponds to the FAISS::metricType enum (default is euclidean) - * @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm - * @param[in] expanded_form whether or not lp variants should be reduced w/ lp-root - */ -template -void brute_force_knn_impl(std::vector &input, std::vector &sizes, - IntType D, float *search_items, IntType n, int64_t *res_I, - float *res_D, IntType k, - std::shared_ptr allocator, - cudaStream_t userStream, - cudaStream_t *internalStreams = nullptr, - int n_int_streams = 0, bool rowMajorIndex = true, - bool rowMajorQuery = true, - std::vector *translations = nullptr, - MetricType metric = MetricType::METRIC_L2, - float metricArg = 2.0, bool expanded_form = false) { - - ASSERT(input.size() == sizes.size(), - "input and sizes vectors should be the same size"); - - faiss::MetricType m = detail::build_faiss_metric(metric); - - std::vector *id_ranges; - if (translations == nullptr) { - // If we don't have explicit translations - // for offsets of the indices, build them - // from the local partitions - id_ranges = new std::vector(); - int64_t total_n = 0; - for (int i = 0; i < input.size(); i++) { - id_ranges->push_back(total_n); - total_n += sizes[i]; - } - } else { - // otherwise, use the given translations - id_ranges = translations; - } - - // perform preprocessing - std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, k, rowMajorQuery, userStream, - allocator); - query_metric_processor->preprocess(search_items); - - std::vector>> metric_processors( - input.size()); - for (int i = 0; i < input.size(); i++) { - metric_processors[i] = create_processor( - metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); - metric_processors[i]->preprocess(input[i]); - } - - int device; - CUDA_CHECK(cudaGetDevice(&device)); - - raft::mr::device::buffer trans(allocator, userStream, id_ranges->size()); - raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), - userStream); - - raft::mr::device::buffer all_D(allocator, userStream, 0); - raft::mr::device::buffer all_I(allocator, userStream, 0); - - float *out_D = res_D; - int64_t *out_I = res_I; - - if (input.size() > 1) { - all_D.resize(input.size() * k * n, userStream); - all_I.resize(input.size() * k * n, userStream); - - out_D = all_D.data(); - out_I = all_I.data(); - } - - // Sync user stream only if using other streams to parallelize query - if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); - - for (int i = 0; i < input.size(); i++) { - faiss::gpu::StandardGpuResources gpu_res; - - cudaStream_t stream = - raft::select_stream(userStream, internalStreams, n_int_streams, i); - - gpu_res.noTempMemory(); - gpu_res.setCudaMallocWarning(false); - gpu_res.setDefaultStream(device, stream); - - faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = metricArg; - args.k = k; - args.dims = D; - 
args.vectors = input[i]; - args.vectorsRowMajor = rowMajorIndex; - args.numVectors = sizes[i]; - args.queries = search_items; - args.queriesRowMajor = rowMajorQuery; - args.numQueries = n; - args.outDistances = out_D + (i * k * n); - args.outIndices = out_I + (i * k * n); - - /** - * @todo: Until FAISS supports pluggable allocation strategies, - * we will not reap the benefits of the pool allocator for - * avoiding device-wide synchronizations from cudaMalloc/cudaFree - */ - bfKnn(&gpu_res, args); - - CUDA_CHECK(cudaPeekAtLastError()); - } - - // Sync internal streams if used. We don't need to - // sync the user stream because we'll already have - // fully serial execution. - for (int i = 0; i < n_int_streams; i++) { - CUDA_CHECK(cudaStreamSynchronize(internalStreams[i])); - } - - if (input.size() > 1 || translations != nullptr) { - // This is necessary for proper index translations. If there are - // no translations or partitions to combine, it can be skipped. - detail::knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, - trans.data()); - } - - // Perform necessary post-processing - if ((m == faiss::MetricType::METRIC_L2 || - m == faiss::MetricType::METRIC_Lp) && - !expanded_form) { - /** - * post-processing - */ - float p = 0.5; // standard l2 - if (m == faiss::MetricType::METRIC_Lp) p = 1.0 / metricArg; - raft::linalg::unaryOp( - res_D, res_D, n * k, - [p] __device__(float input) { return powf(input, p); }, userStream); - } - - query_metric_processor->revert(search_items); - query_metric_processor->postprocess(out_D); - for (int i = 0; i < input.size(); i++) { - metric_processors[i]->revert(input[i]); - } - - if (translations == nullptr) delete id_ranges; -} - /** * @brief Flat C++ API function to perform a brute force knn on * a series of input arrays and combine the results into a single @@ -229,7 +59,7 @@ void brute_force_knn(raft::handle_t &handle, std::vector &input, std::vector int_streams = handle.get_internal_streams(); - brute_force_knn_impl( + detail::brute_force_knn_impl( input, sizes, D, search_items, n, res_I, res_D, k, handle.get_device_allocator(), handle.get_stream(), int_streams.data(), handle.get_num_internal_streams(), rowMajorIndex, rowMajorQuery, nullptr, diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 85780cfcf4..bfacc4935e 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -17,23 +17,63 @@ #include #include #include +#include #include namespace raft { - +namespace knn { struct KNNInputs { std::vector> input; int k; + std::vector labels; }; +__global__ void build_actual_output(int *output, int n_rows, int k, + const int *idx_labels, + const int64_t *indices) { + int element = threadIdx.x + blockDim.x * blockIdx.x; + if (element >= n_rows * k) return; + + int ind = (int)indices[element]; + output[element] = idx_labels[ind]; +} + +__global__ void build_expected_output(int *output, int n_rows, int k, + const int *labels) { + int row = threadIdx.x + blockDim.x * blockIdx.x; + if (row >= n_rows) return; + + int cur_label = labels[row]; + for (int i = 0; i < k; i++) { + output[row * k + i] = cur_label; + } +} + template class KNNTest : public ::testing::TestWithParam { protected: void testBruteForce() { + raft::print_device_vector("Input array: ", input_, + rows_ * cols_, std::cout); + std::cout << "K: " << k_ << "\n"; + raft::print_device_vector("Labels array: ", search_labels_, + rows_, std::cout); + + auto stream = handle_.get_stream(); + + raft::allocate(actual_labels_, rows_ * k_, true); + 
raft::allocate(expected_labels_, rows_ * k_, true); + + std::vector input_vec; + std::vector sizes_vec; + input_vec.push_back(input_); + sizes_vec.push_back(rows_); + + brute_force_knn(handle_, - input_, - sizes_, + input_vec, + sizes_vec, cols_, search_data_, rows_, @@ -41,37 +81,67 @@ class KNNTest : public ::testing::TestWithParam { distances_, k_, true, - true); + true, + MetricType::METRIC_L2, + 0.0, + false); + + build_actual_output<<>>( + actual_labels_, rows_, + k_, search_labels_, + indices_); + + build_expected_output<<>>( + expected_labels_, rows_, + k_, search_labels_); + + raft::print_device_vector("Output indices: ", indices_, + rows_ * k_, std::cout); + raft::print_device_vector("Output distances: ", distances_, + rows_ * k_, std::cout); + raft::print_device_vector("Output labels: ", actual_labels_, + rows_ * k_, std::cout); + raft::print_device_vector("Expected labels: ", expected_labels_, + rows_ * k_, std::cout); } void SetUp() override { - params = ::testing::TestWithParam::GetParam(); - rows_ = params.input.size(); - cols_ = params.input[0].size(); - k_ = params.k; - - float *input_d = rmm::device_buffer(params.input.data(), - params.input.size() * sizeof(float)); - - input_.push_back(input_d); - sizes_.push_back(rows_); - - raft::allocate(search_data_, row_ * cols_, true); - raft::allocate(indices_, - rows_ * cols_, - true); - raft::allocate(distances_, - rows_ * cols_, - true); + params_ = ::testing::TestWithParam::GetParam(); + rows_ = params_.input.size(); + cols_ = params_.input[0].size(); + k_ = params_.k; + + std::vector row_major_input; + for (int i = 0; i < params_.input.size(); ++i) { + for (int j = 0; j < params_.input[i].size(); ++j) { + row_major_input.push_back(params_.input[i][j]); + } + } + rmm::device_buffer input_d = rmm::device_buffer(row_major_input.data(), + row_major_input.size() * sizeof(float)); + float *input_ptr = static_cast(input_d.data()); + + rmm::device_buffer labels_d = rmm::device_buffer(params_.labels.data(), + params_.labels.size() * sizeof(int)); + int *labels_ptr = static_cast(labels_d.data()); + + raft::allocate(input_, rows_ * cols_, true); + raft::allocate(search_data_, rows_ * cols_, true); + raft::allocate(search_data_, rows_ * cols_, true); + raft::allocate(indices_, rows_ * k_, true); + raft::allocate(distances_, rows_ * k_, true); + raft::allocate(search_labels_, rows_, true); + + raft::copy(input_, input_ptr, rows_ * cols_, handle_.get_stream()); + raft::copy(search_data_, input_ptr, rows_ * cols_, handle_.get_stream()); + raft::copy(search_labels_, labels_ptr, rows_, handle_.get_stream()); } void TearDown() override { - CUDA_CHECK(cudaFree(search_data)); - CUDA_CHECK(cudaFree(search_labels)); - CUDA_CHECK(cudaFree(output_dists)); - CUDA_CHECK(cudaFree(output_indices)); - CUDA_CHECK(cudaFree(actual_labels)); - CUDA_CHECK(cudaFree(expected_labels)); + CUDA_CHECK(cudaFree(search_data_)); + CUDA_CHECK(cudaFree(indices_)); + CUDA_CHECK(cudaFree(distances_)); + CUDA_CHECK(cudaFree(actual_labels_)); } private: @@ -79,12 +149,15 @@ class KNNTest : public ::testing::TestWithParam { KNNInputs params_; int rows_; int cols_; - std::vector input_; - std::vector sizes_; + float *input_; float *search_data_; - int64_t indices_; + int64_t *indices_; float* distances_; int k_; + + int *search_labels_; + int *actual_labels_; + int *expected_labels_; }; @@ -92,27 +165,20 @@ const std::vector inputs = { // 2D { { - { 7.89611 , -6.3093657 }, - { 8.198494 , -6.6102095 }, - {-1.067701 , 0.2757877 }, - { 5.5629272, -4.0279684 }, - { 8.466168 , 
-6.3818727 }, - { 7.373038 , -3.2476108 }, - { 7.3618903, -6.311329 }, - { 3.5585778, 2.3175476 }, - { 8.722544 , -6.184722 }, - { 5.9165254, -4.0085735 }, - {-2.4502695, 1.8806121 }, - { 1.250205 , 1.6940732 }, - { 7.702861 , -5.5382366 }, - {-0.32521492, 1.0503006 }, - { 7.203165 , -6.1078873 }, - { 0.7067232, -0.02844107}, - {-0.6195269, 1.6659582 }, - { 7.3585844, -6.5425425 }, - { 0.2946735, 0.7920021 }, - { 5.9978905, -4.235259 }}, - 2}, + { 2.7810836,2.550537003 }, + { 1.465489372,2.362125076}, + { 3.396561688,4.400293529}, + { 1.38807019,1.850220317 }, + { 3.06407232,3.005305973 }, + { 7.627531214,2.759262235 }, + { 5.332441248,2.088626775 }, + { 6.922596716,1.77106367 }, + { 8.675418651,-0.242068655}, + {7.673756466,3.508563011 }, + }, + 2, + {0, 0, 0, 0, 0, 1, 1, 1, 1, 1} + } }; typedef KNNTest KNNTestF; @@ -120,4 +186,5 @@ TEST_P(KNNTestF, BruteForce) { this->testBruteForce(); } INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestF, ::testing::ValuesIn(inputs)); +} // namespace knn } // namespace raft From 418417fdc3fc6a70d7400fb1ba6c9b237fc492d5 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 13 Jan 2021 17:28:57 -0600 Subject: [PATCH 08/19] Find faiss in conda env --- cpp/CMakeLists.txt | 28 ++++---- cpp/cmake/Dependencies.cmake | 7 +- cpp/cmake/templates/CMakeLists.txt | 51 --------------- cpp/cmake/templates/Findpkg.cmake.in | 98 ---------------------------- 4 files changed, 18 insertions(+), 166 deletions(-) delete mode 100644 cpp/cmake/templates/CMakeLists.txt delete mode 100644 cpp/cmake/templates/Findpkg.cmake.in diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 82d7c3d2f0..6e4530a707 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -14,7 +14,6 @@ # limitations under the License. #============================================================================= - cmake_minimum_required(VERSION 3.14 FATAL_ERROR) project(RAFT VERSION 0.18.0 LANGUAGES CXX CUDA) @@ -67,18 +66,6 @@ set(GPU_ARCHS "" CACHE STRING ############################################################################## # - Requirements ------------------------------------------------------------- -# Create FindPackage.cmake files to use find(package) functionality for -# dependencies - -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates) - -if(NOT BUILD_STATIC_FAISS) - GENERATE_FIND_MODULE( - NAME FAISS - HEADER_NAME faiss/IndexFlat.h - LIBRARY_NAME faiss) -endif(NOT BUILD_STATIC_FAISS) - set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) find_package(CUDA 10.0 REQUIRED) @@ -189,6 +176,18 @@ endif() include(cmake/Dependencies.cmake) include(cmake/comms.cmake) +################################################################################################### +# - FAISS ------------------------------------------------------------------------------------------- + +if(NOT BUILD_STATIC_FAISS) + find_path(FAISS_INCLUDE_DIRS "faiss" + HINTS + "$ENV{FAISS_ROOT}/include" + "$ENV{CONDA_PREFIX}/include/faiss" + "$ENV{CONDA_PREFIX}/include") +endif(NOT BUILD_STATIC_FAISS) +message(STATUS "FAISS: FAISS_INCLUDE_DIRS set to ${FAISS_INCLUDE_DIRS}") + ################################################################################################### # - RMM ------------------------------------------------------------------------------------------- @@ -226,7 +225,6 @@ endif(DEFINED ENV{CONDA_PREFIX}) # - libraries ---------------------------------------------------------------- set(RAFT_LINK_LIBRARIES - FAISS::FAISS ${CUDA_cublas_LIBRARY} ${CUDA_cusolver_LIBRARY} ${CUDA_CUDART_LIBRARY} @@ 
-234,6 +232,7 @@ set(RAFT_LINK_LIBRARIES ${CUDA_curand_LIBRARY}) set(RAFT_LINK_DIRECTORIES + ${FAISS_INCLUDE_DIRS} ${RMM_INCLUDE_DIRS}) if(DEFINED ENV{CONDA_PREFIX}) @@ -300,6 +299,7 @@ if(BUILD_RAFT_TESTS) target_link_libraries(test_raft PRIVATE ${RAFT_LINK_LIBRARIES} + FAISS::FAISS GTest::GTest GTest::Main OpenMP::OpenMP_CXX diff --git a/cpp/cmake/Dependencies.cmake b/cpp/cmake/Dependencies.cmake index c5dd705003..080efb5b1f 100644 --- a/cpp/cmake/Dependencies.cmake +++ b/cpp/cmake/Dependencies.cmake @@ -66,8 +66,10 @@ if(BUILD_STATIC_FAISS) # set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/src/faiss") set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/src") else() - set(FAISS_INSTALL_DIR ENV{FAISS_ROOT}) - find_package(FAISS REQUIRED) + add_library(FAISS::FAISS SHARED IMPORTED) + set_property(TARGET FAISS::FAISS PROPERTY + IMPORTED_LOCATION $ENV{CONDA_PREFIX}/lib/libfaiss.so) + message(STATUS "Found FAISS: $ENV{CONDA_PREFIX}/lib/libfaiss.so") endif(BUILD_STATIC_FAISS) ############################################################################## @@ -106,5 +108,4 @@ endif(BUILD_GTEST) if(NOT CUB_IS_PART_OF_CTK) add_dependencies(GTest::GTest cub) endif(NOT CUB_IS_PART_OF_CTK) -add_dependencies(FAISS::FAISS benchmark) add_dependencies(FAISS::FAISS faiss) diff --git a/cpp/cmake/templates/CMakeLists.txt b/cpp/cmake/templates/CMakeLists.txt deleted file mode 100644 index 5bf20c9061..0000000000 --- a/cpp/cmake/templates/CMakeLists.txt +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -# File to generate Find*.cmake modules to find packages. -# This file uses Findpkd.cmake.in to generate modules so that we can use -# find(package) in the main CMake for any package we want. -# -# To use, include in CMakeLists.txt this folder, and then use the function -# GENERATE_FIND_MODULE, which takes the following parameters: -# NAME: -# Name of the package to find in find(NAME) -# HEADER_NAME: -# Name of header file to use to find include dirs path of the package. -# LIBRARY_NAME: -# (Optional) Name of library to find to find include lib path of the package. -# Assumed to be lib${NAME} if left empty. -# LOCATION: -# (Optional) Name of additional folder to look for headers/lib files. -# Useful if one wants to reduce size of #include commands -# VERSION: -# (Optional) Version of the package. 
Useful to find libraries that append -# version number to their filename (for example libopenblas.so.0) - - -function(GENERATE_FIND_MODULE) - set(oneValueArgs NAME LOCATION VERSION) - set(multiValueArgs HEADER_NAME LIBRARY_NAME) - cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN} ) - - if(NOT DEFINED PKG_LIBRARY_NAME) - set(PKG_LIBRARY_NAME ${PKG_NAME}) - endif(NOT DEFINED PKG_LIBRARY_NAME) - - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/Findpkg.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/cmake/Find${PKG_NAME}.cmake - @ONLY) -endfunction() - diff --git a/cpp/cmake/templates/Findpkg.cmake.in b/cpp/cmake/templates/Findpkg.cmake.in deleted file mode 100644 index a81f167d62..0000000000 --- a/cpp/cmake/templates/Findpkg.cmake.in +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Based on FindPNG.cmake from cmake 3.14.3 - -#[=======================================================================[.rst: -Find@PKG_NAME@ --------- - -Template to generate FindPKG_NAME.cmake CMake modules - -Find @PKG_NAME@ - -Imported targets -^^^^^^^^^^^^^^^^ - -This module defines the following :prop_tgt:`IMPORTED` target: - -``@PKG_NAME@::@PKG_NAME@`` - The lib@PKG_NAME@ library, if found. - -Result variables -^^^^^^^^^^^^^^^^ - -This module will set the following variables in your project: - -``@PKG_NAME@_INCLUDE_DIRS`` - where to find @PKG_NAME@.hpp , etc. -``@PKG_NAME@_LIBRARIES`` - the libraries to link against to use lib@PKG_NAME@. -``@PKG_NAME@_FOUND`` - If false, do not try to use @PKG_NAME@. -``@PKG_NAME@_VERSION_STRING`` - the version of the @PKG_NAME@ library found - -#]=======================================================================] - -find_path(@PKG_NAME@_LOCATION @PKG_HEADER_NAME@ - HINTS ${@PKG_NAME@_INSTALL_DIR} - PATH_SUFFIXES include include/@PKG_LOCATION@) - -list(APPEND @PKG_NAME@_NAMES @PKG_LIBRARY_NAME@ lib@PKG_LIBRARY_NAME@) -set(_@PKG_NAME@_VERSION_SUFFIXES @PKG_VERSION@) - -foreach(v IN LISTS _@PKG_NAME@_VERSION_SUFFIXES) - list(APPEND @PKG_NAME@_NAMES @PKG_LIBRARY_NAME@${v} lib@PKG_LIBRARY_NAME@${v}) - list(APPEND @PKG_NAME@_NAMES @PKG_LIBRARY_NAME@.${v} lib@PKG_LIBRARY_NAME@.${v}) -endforeach() -unset(_@PKG_NAME@_VERSION_SUFFIXES) - -find_library(@PKG_NAME@_LIBRARY_RELEASE NAMES ${@PKG_NAME@_NAMES} - HINTS ${@PKG_NAME@_INSTALL_DIR} - PATH_SUFFIXES lib) - -include(${CMAKE_ROOT}/Modules/SelectLibraryConfigurations.cmake) -select_library_configurations(@PKG_NAME@) -mark_as_advanced(@PKG_NAME@_LIBRARY_RELEASE) -unset(@PKG_NAME@_NAMES) - -# Set by select_library_configurations(), but we want the one from -# find_package_handle_standard_args() below. 
-unset(@PKG_NAME@_FOUND) - -if (@PKG_NAME@_LIBRARY AND @PKG_NAME@_LOCATION) - set(@PKG_NAME@_INCLUDE_DIRS ${@PKG_NAME@_LOCATION} ) - set(@PKG_NAME@_LIBRARY ${@PKG_NAME@_LIBRARY}) - - if(NOT TARGET @PKG_NAME@::@PKG_NAME@) - add_library(@PKG_NAME@::@PKG_NAME@ UNKNOWN IMPORTED) - set_target_properties(@PKG_NAME@::@PKG_NAME@ PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${@PKG_NAME@_INCLUDE_DIRS}") - if(EXISTS "${@PKG_NAME@_LIBRARY}") - set_target_properties(@PKG_NAME@::@PKG_NAME@ PROPERTIES - IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" - IMPORTED_LOCATION "${@PKG_NAME@_LIBRARY}") - endif() - endif() -endif () - - -include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) -find_package_handle_standard_args(@PKG_NAME@ - REQUIRED_VARS @PKG_NAME@_LIBRARY @PKG_NAME@_LOCATION - VERSION_VAR @PKG_NAME@_VERSION_STRING) - -mark_as_advanced(@PKG_NAME@_LOCATION @PKG_NAME@_LIBRARY) From b47311168a5affac138401a2f0c6d9a683b4f2d1 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 13 Jan 2021 20:17:27 -0600 Subject: [PATCH 09/19] Remove warning in loop --- cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index 432c3b2cf5..0e7206edca 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -211,7 +211,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, // from the local partitions id_ranges = new std::vector(); int64_t total_n = 0; - for (int i = 0; i < input.size(); i++) { + for (size_t i = 0; i < input.size(); i++) { id_ranges->push_back(total_n); total_n += sizes[i]; } @@ -228,7 +228,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, std::vector>> metric_processors( input.size()); - for (int i = 0; i < input.size(); i++) { + for (size_t i = 0; i < input.size(); i++) { metric_processors[i] = create_processor( metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); metric_processors[i]->preprocess(input[i]); @@ -258,7 +258,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, // Sync user stream only if using other streams to parallelize query if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); - for (int i = 0; i < input.size(); i++) { + for (size_t i = 0; i < input.size(); i++) { faiss::gpu::StandardGpuResources gpu_res; cudaStream_t stream = @@ -322,7 +322,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, query_metric_processor->revert(search_items); query_metric_processor->postprocess(out_D); - for (int i = 0; i < input.size(); i++) { + for (size_t i = 0; i < input.size(); i++) { metric_processors[i]->revert(input[i]); } From cfbe32a4759b58f63c9ffcc6640d6ae23b5c6255 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Fri, 15 Jan 2021 19:19:07 -0600 Subject: [PATCH 10/19] Update to raft distance type --- .../spatial/knn/detail/brute_force_knn.hpp | 26 ++++++++----- .../raft/spatial/knn/detail/processing.hpp | 9 ++--- cpp/include/raft/spatial/knn/detail/utils.hpp | 39 ------------------- cpp/include/raft/spatial/knn/knn.hpp | 7 ++-- cpp/test/spatial/knn.cu | 7 +--- 5 files changed, 26 insertions(+), 62 deletions(-) delete mode 100644 cpp/include/raft/spatial/knn/detail/utils.hpp diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index 
0e7206edca..b94bab57af 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -1,8 +1,5 @@ #pragma once -#include "processing.hpp" -#include "utils.hpp" - #include #include @@ -16,9 +13,12 @@ #include #include #include +#include #include #include +#include "processing.hpp" + namespace raft { namespace knn { namespace detail { @@ -149,14 +149,20 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); } -inline faiss::MetricType build_faiss_metric(MetricType metric) { +inline faiss::MetricType build_faiss_metric(distance::DistanceType metric) { switch (metric) { - case MetricType::METRIC_Cosine: - return faiss::MetricType::METRIC_INNER_PRODUCT; - case MetricType::METRIC_Correlation: - return faiss::MetricType::METRIC_INNER_PRODUCT; + case distance::DistanceType::EucUnexpandedL2: + return faiss::MetricType::METRIC_L2; + case distance::DistanceType::EucUnexpandedL1: + return faiss::MetricType::METRIC_L1; + case distance::DistanceType::ChebyChev: + return faiss::MetricType::METRIC_Linf; + case distance::DistanceType::Minkowski: + return faiss::MetricType::METRIC_Lp; + case distance::DistanceType::Canberra: + return faiss::MetricType::METRIC_Canberra; default: - return (faiss::MetricType)metric; + return faiss::MetricType::METRIC_INNER_PRODUCT; } } @@ -196,7 +202,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, int n_int_streams = 0, bool rowMajorIndex = true, bool rowMajorQuery = true, std::vector *translations = nullptr, - MetricType metric = MetricType::METRIC_L2, + distance::DistanceType metric = distance::DistanceType::EucUnexpandedL2, float metricArg = 2.0, bool expanded_form = false) { ASSERT(input.size() == sizes.size(), diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index 57c7e1050c..ca779f45c3 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -15,8 +15,7 @@ */ #pragma once -#include "utils.hpp" - +#include #include #include #include @@ -160,17 +159,17 @@ class DefaultMetricProcessor : public MetricProcessor { template inline std::unique_ptr> create_processor( - MetricType metric, int n, int D, int k, bool rowMajorQuery, + distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, cudaStream_t userStream, std::shared_ptr allocator) { MetricProcessor *mp = nullptr; switch (metric) { - case MetricType::METRIC_Cosine: + case distance::DistanceType::EucExpandedCosine: mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); break; - case MetricType::METRIC_Correlation: + case distance::DistanceType::Correlation: mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); break; diff --git a/cpp/include/raft/spatial/knn/detail/utils.hpp b/cpp/include/raft/spatial/knn/detail/utils.hpp deleted file mode 100644 index c8542772f7..0000000000 --- a/cpp/include/raft/spatial/knn/detail/utils.hpp +++ /dev/null @@ -1,39 +0,0 @@ - -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace raft { -namespace knn { - - enum MetricType { - METRIC_INNER_PRODUCT = 0, - METRIC_L2, - METRIC_L1, - METRIC_Linf, - METRIC_Lp, - - METRIC_Canberra = 20, - METRIC_BrayCurtis, - METRIC_JensenShannon, - - METRIC_Cosine = 100, - METRIC_Correlation - }; - -} // namespace knn -} // namespace raft diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index b6e3ed09ab..2a438c2f69 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -51,9 +51,10 @@ using deviceAllocator = raft::mr::device::allocator; */ void brute_force_knn(raft::handle_t &handle, std::vector &input, std::vector &sizes, int D, float *search_items, int n, - int64_t *res_I, float *res_D, int k, bool rowMajorIndex, - bool rowMajorQuery, MetricType metric, float metric_arg, - bool expanded) { + int64_t *res_I, float *res_D, int k, bool rowMajorIndex = false, + bool rowMajorQuery = false, + distance::DistanceType metric = distance::DistanceType::EucUnexpandedL2, + float metric_arg = 2.0f, bool expanded = false) { ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size"); diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index bfacc4935e..680d641f27 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -19,6 +19,7 @@ #include #include #include +#include namespace raft { namespace knn { @@ -81,10 +82,7 @@ class KNNTest : public ::testing::TestWithParam { distances_, k_, true, - true, - MetricType::METRIC_L2, - 0.0, - false); + true); build_actual_output<<>>( actual_labels_, rows_, @@ -127,7 +125,6 @@ class KNNTest : public ::testing::TestWithParam { raft::allocate(input_, rows_ * cols_, true); raft::allocate(search_data_, rows_ * cols_, true); - raft::allocate(search_data_, rows_ * cols_, true); raft::allocate(indices_, rows_ * k_, true); raft::allocate(distances_, rows_ * k_, true); raft::allocate(search_labels_, rows_, true); From e480951f69ca2bb28fd600d52e6f8aa6551edb58 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Tue, 26 Jan 2021 15:19:12 -0600 Subject: [PATCH 11/19] Update distances and add assert in test --- .../raft/spatial/knn/detail/brute_force_knn.hpp | 14 +++++++++----- cpp/include/raft/spatial/knn/detail/processing.hpp | 4 ++-- cpp/include/raft/spatial/knn/knn.hpp | 2 +- cpp/test/spatial/knn.cu | 11 ++++++++--- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index b94bab57af..ce86b2ce77 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -151,16 +151,20 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, inline faiss::MetricType build_faiss_metric(distance::DistanceType metric) { switch (metric) { - case distance::DistanceType::EucUnexpandedL2: + case distance::DistanceType::L2Unexpanded: return faiss::MetricType::METRIC_L2; - case distance::DistanceType::EucUnexpandedL1: + case 
distance::DistanceType::L1: return faiss::MetricType::METRIC_L1; - case distance::DistanceType::ChebyChev: + case distance::DistanceType::Linf: return faiss::MetricType::METRIC_Linf; - case distance::DistanceType::Minkowski: + case distance::DistanceType::LpUnexpanded: return faiss::MetricType::METRIC_Lp; case distance::DistanceType::Canberra: return faiss::MetricType::METRIC_Canberra; + case distance::DistanceType::BrayCurtis: + return faiss::MetricType::METRIC_BrayCurtis; + case distance::DistanceType::JensenShannon: + return faiss::MetricType::METRIC_JensenShannon; default: return faiss::MetricType::METRIC_INNER_PRODUCT; } @@ -202,7 +206,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, int n_int_streams = 0, bool rowMajorIndex = true, bool rowMajorQuery = true, std::vector *translations = nullptr, - distance::DistanceType metric = distance::DistanceType::EucUnexpandedL2, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded, float metricArg = 2.0, bool expanded_form = false) { ASSERT(input.size() == sizes.size(), diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index ca779f45c3..60f6b13800 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -164,12 +164,12 @@ inline std::unique_ptr> create_processor( MetricProcessor *mp = nullptr; switch (metric) { - case distance::DistanceType::EucExpandedCosine: + case distance::DistanceType::CosineExpanded: mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); break; - case distance::DistanceType::Correlation: + case distance::DistanceType::CorrelationExpanded: mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); break; diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index 2a438c2f69..556ddba86e 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -53,7 +53,7 @@ void brute_force_knn(raft::handle_t &handle, std::vector &input, std::vector &sizes, int D, float *search_items, int n, int64_t *res_I, float *res_D, int k, bool rowMajorIndex = false, bool rowMajorQuery = false, - distance::DistanceType metric = distance::DistanceType::EucUnexpandedL2, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded, float metric_arg = 2.0f, bool expanded = false) { ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size"); diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 680d641f27..27ff4a19d9 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -16,10 +16,11 @@ #include #include -#include -#include -#include #include +#include +#include +#include +#include "../test_utils.h" namespace raft { namespace knn { @@ -101,6 +102,10 @@ class KNNTest : public ::testing::TestWithParam { rows_ * k_, std::cout); raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_, std::cout); + + ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, + rows_ * k_, + raft::Compare())); } void SetUp() override { From 0ba85770c76bde649a16dfbb47d45f5429f3a2c7 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 27 Jan 2021 12:43:27 -0600 Subject: [PATCH 12/19] Clean pr --- cpp/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6e4530a707..5db1504e57 100644 --- a/cpp/CMakeLists.txt +++ 
b/cpp/CMakeLists.txt @@ -66,9 +66,10 @@ set(GPU_ARCHS "" CACHE STRING ############################################################################## # - Requirements ------------------------------------------------------------- +find_package(CUDA 10.0 REQUIRED) + set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) -find_package(CUDA 10.0 REQUIRED) ############################################################################## # - Compiler Options -------------------------------------------------------- From 63a634c890f7bd67ecf2763ce0f5a0293b0e60e0 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 27 Jan 2021 12:44:13 -0600 Subject: [PATCH 13/19] Clean pr --- cpp/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5db1504e57..283ae2f83a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -70,7 +70,6 @@ find_package(CUDA 10.0 REQUIRED) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) - ############################################################################## # - Compiler Options -------------------------------------------------------- From f3c5f2b43cae43925040cf79906162d0f0e31ec7 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 27 Jan 2021 12:45:00 -0600 Subject: [PATCH 14/19] Clean cmake --- cpp/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 283ae2f83a..fe9f927348 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -14,6 +14,7 @@ # limitations under the License. #============================================================================= + cmake_minimum_required(VERSION 3.14 FATAL_ERROR) project(RAFT VERSION 0.18.0 LANGUAGES CXX CUDA) From b7c90960ef90d65fb91793ca07f4f555a8c93e25 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 27 Jan 2021 12:47:13 -0600 Subject: [PATCH 15/19] Uncomment tests --- cpp/CMakeLists.txt | 78 +++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fe9f927348..2c39b30881 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -248,45 +248,45 @@ if(BUILD_RAFT_TESTS) # keep the files in alphabetical order! 
add_executable(test_raft - # test/cudart_utils.cpp - #test/handle.cpp - #test/integer_utils.cpp - #test/lap/lap.cu - #test/linalg/add.cu - #test/linalg/binary_op.cu - #test/linalg/cholesky_r1.cu - #test/linalg/coalesced_reduction.cu - #test/linalg/divide.cu - #test/linalg/eig.cu - #test/linalg/eig_sel.cu - #test/linalg/gemm_layout.cu - #test/linalg/map_then_reduce.cu - #test/linalg/matrix_vector_op.cu - #test/linalg/multiply.cu - #test/linalg/norm.cu - #test/linalg/reduce.cu - #test/linalg/strided_reduction.cu - #test/linalg/subtract.cu - #test/linalg/svd.cu - #test/linalg/transpose.cu - #test/linalg/unary_op.cu - #test/matrix/math.cu - #test/matrix/matrix.cu - #test/mr/device/buffer.cpp - #test/mr/host/buffer.cpp - #test/random/rng.cu - #test/random/rng_int.cu - #test/random/sample_without_replacement.cu - test/spatial/knn.cu) - #test/stats/mean.cu - #test/stats/mean_center.cu - #test/stats/stddev.cu - #test/stats/sum.cu - #test/test.cpp - #test/spectral_matrix.cu - #test/eigen_solvers.cu - #test/cluster_solvers.cu - #test/mst.cu) + test/cudart_utils.cpp + test/handle.cpp + test/integer_utils.cpp + test/lap/lap.cu + test/linalg/add.cu + test/linalg/binary_op.cu + test/linalg/cholesky_r1.cu + test/linalg/coalesced_reduction.cu + test/linalg/divide.cu + test/linalg/eig.cu + test/linalg/eig_sel.cu + test/linalg/gemm_layout.cu + test/linalg/map_then_reduce.cu + test/linalg/matrix_vector_op.cu + test/linalg/multiply.cu + test/linalg/norm.cu + test/linalg/reduce.cu + test/linalg/strided_reduction.cu + test/linalg/subtract.cu + test/linalg/svd.cu + test/linalg/transpose.cu + test/linalg/unary_op.cu + test/matrix/math.cu + test/matrix/matrix.cu + test/mr/device/buffer.cpp + test/mr/host/buffer.cpp + test/random/rng.cu + test/random/rng_int.cu + test/random/sample_without_replacement.cu + test/spatial/knn.cu + test/stats/mean.cu + test/stats/mean_center.cu + test/stats/stddev.cu + test/stats/sum.cu + test/test.cpp + test/spectral_matrix.cu + test/eigen_solvers.cu + test/cluster_solvers.cu + test/mst.cu) target_include_directories(test_raft PRIVATE From c247f250bd85e8127d9c1d95f7052a6dc5b2cf62 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 27 Jan 2021 17:52:09 -0600 Subject: [PATCH 16/19] Clang format --- .../spatial/knn/detail/brute_force_knn.hpp | 40 +++---- .../raft/spatial/knn/detail/processing.hpp | 4 +- cpp/include/raft/spatial/knn/knn.hpp | 18 +-- cpp/test/spatial/knn.cu | 113 ++++++++---------- 4 files changed, 77 insertions(+), 98 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index ce86b2ce77..caf79265c6 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -10,11 +10,11 @@ #include #include +#include #include #include -#include -#include #include +#include #include #include "processing.hpp" @@ -197,19 +197,16 @@ inline faiss::MetricType build_faiss_metric(distance::DistanceType metric) { * @param[in] expanded_form whether or not lp variants should be reduced w/ lp-root */ template -void brute_force_knn_impl(std::vector &input, std::vector &sizes, - IntType D, float *search_items, IntType n, int64_t *res_I, - float *res_D, IntType k, - std::shared_ptr allocator, - cudaStream_t userStream, - cudaStream_t *internalStreams = nullptr, - int n_int_streams = 0, bool rowMajorIndex = true, - bool rowMajorQuery = true, - std::vector *translations = nullptr, - distance::DistanceType metric = 
distance::DistanceType::L2Unexpanded, - float metricArg = 2.0, bool expanded_form = false) { - - ASSERT(input.size() == sizes.size(), +void brute_force_knn_impl( + std::vector &input, std::vector &sizes, IntType D, + float *search_items, IntType n, int64_t *res_I, float *res_D, IntType k, + std::shared_ptr allocator, + cudaStream_t userStream, cudaStream_t *internalStreams = nullptr, + int n_int_streams = 0, bool rowMajorIndex = true, bool rowMajorQuery = true, + std::vector *translations = nullptr, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded, + float metricArg = 2.0, bool expanded_form = false) { + ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size"); faiss::MetricType m = detail::build_faiss_metric(metric); @@ -247,7 +244,8 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, int device; CUDA_CHECK(cudaGetDevice(&device)); - raft::mr::device::buffer trans(allocator, userStream, id_ranges->size()); + raft::mr::device::buffer trans(allocator, userStream, + id_ranges->size()); raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream); @@ -312,8 +310,8 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, if (input.size() > 1 || translations != nullptr) { // This is necessary for proper index translations. If there are // no translations or partitions to combine, it can be skipped. - detail::knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, - trans.data()); + detail::knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, + userStream, trans.data()); } // Perform necessary post-processing @@ -339,6 +337,6 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, if (translations == nullptr) delete id_ranges; } -} // namespace detail -} // namespace knn -} // namespace raft +} // namespace detail +} // namespace knn +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index 60f6b13800..1b6589100f 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -19,10 +19,10 @@ #include #include #include -#include -#include #include #include +#include +#include namespace raft { namespace knn { diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index 556ddba86e..668f5291e1 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -22,7 +22,7 @@ #include namespace raft { - namespace knn { +namespace knn { using deviceAllocator = raft::mr::device::allocator; @@ -49,12 +49,12 @@ using deviceAllocator = raft::mr::device::allocator; * @param[in] expanded should lp-based distances be returned in their expanded * form (e.g., without raising to the 1/p power). 
*/ -void brute_force_knn(raft::handle_t &handle, std::vector &input, - std::vector &sizes, int D, float *search_items, int n, - int64_t *res_I, float *res_D, int k, bool rowMajorIndex = false, - bool rowMajorQuery = false, - distance::DistanceType metric = distance::DistanceType::L2Unexpanded, - float metric_arg = 2.0f, bool expanded = false) { +void brute_force_knn( + raft::handle_t &handle, std::vector &input, std::vector &sizes, + int D, float *search_items, int n, int64_t *res_I, float *res_D, int k, + bool rowMajorIndex = false, bool rowMajorQuery = false, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded, + float metric_arg = 2.0f, bool expanded = false) { ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size"); @@ -67,5 +67,5 @@ void brute_force_knn(raft::handle_t &handle, std::vector &input, metric, metric_arg, expanded); } -} // namespace knn -} // namespace raft +} // namespace knn +} // namespace raft diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 27ff4a19d9..37bb859a53 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include #include #include @@ -24,11 +24,11 @@ namespace raft { namespace knn { - struct KNNInputs { - std::vector> input; - int k; - std::vector labels; - }; +struct KNNInputs { + std::vector> input; + int k; + std::vector labels; +}; __global__ void build_actual_output(int *output, int n_rows, int k, const int *idx_labels, @@ -55,12 +55,11 @@ template class KNNTest : public ::testing::TestWithParam { protected: void testBruteForce() { - - raft::print_device_vector("Input array: ", input_, - rows_ * cols_, std::cout); + raft::print_device_vector("Input array: ", input_, rows_ * cols_, + std::cout); std::cout << "K: " << k_ << "\n"; - raft::print_device_vector("Labels array: ", search_labels_, - rows_, std::cout); + raft::print_device_vector("Labels array: ", search_labels_, rows_, + std::cout); auto stream = handle_.get_stream(); @@ -72,40 +71,26 @@ class KNNTest : public ::testing::TestWithParam { input_vec.push_back(input_); sizes_vec.push_back(rows_); - - brute_force_knn(handle_, - input_vec, - sizes_vec, - cols_, - search_data_, - rows_, - indices_, - distances_, - k_, - true, - true); + brute_force_knn(handle_, input_vec, sizes_vec, cols_, search_data_, rows_, + indices_, distances_, k_, true, true); build_actual_output<<>>( - actual_labels_, rows_, - k_, search_labels_, - indices_); + actual_labels_, rows_, k_, search_labels_, indices_); build_expected_output<<>>( - expected_labels_, rows_, - k_, search_labels_); - - raft::print_device_vector("Output indices: ", indices_, - rows_ * k_, std::cout); - raft::print_device_vector("Output distances: ", distances_, - rows_ * k_, std::cout); - raft::print_device_vector("Output labels: ", actual_labels_, - rows_ * k_, std::cout); - raft::print_device_vector("Expected labels: ", expected_labels_, - rows_ * k_, std::cout); - - ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, - rows_ * k_, - raft::Compare())); + expected_labels_, rows_, k_, search_labels_); + + raft::print_device_vector("Output indices: ", indices_, rows_ * k_, + std::cout); + raft::print_device_vector("Output distances: ", distances_, rows_ * k_, + std::cout); + raft::print_device_vector("Output labels: ", actual_labels_, rows_ * k_, + std::cout); + raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_, + std::cout); + + ASSERT_TRUE(devArrMatch(expected_labels_, 
actual_labels_, rows_ * k_, + raft::Compare())); } void SetUp() override { @@ -120,12 +105,12 @@ class KNNTest : public ::testing::TestWithParam { row_major_input.push_back(params_.input[i][j]); } } - rmm::device_buffer input_d = rmm::device_buffer(row_major_input.data(), - row_major_input.size() * sizeof(float)); + rmm::device_buffer input_d = rmm::device_buffer( + row_major_input.data(), row_major_input.size() * sizeof(float)); float *input_ptr = static_cast(input_d.data()); - rmm::device_buffer labels_d = rmm::device_buffer(params_.labels.data(), - params_.labels.size() * sizeof(int)); + rmm::device_buffer labels_d = rmm::device_buffer( + params_.labels.data(), params_.labels.size() * sizeof(int)); int *labels_ptr = static_cast(labels_d.data()); raft::allocate(input_, rows_ * cols_, true); @@ -154,7 +139,7 @@ class KNNTest : public ::testing::TestWithParam { float *input_; float *search_data_; int64_t *indices_; - float* distances_; + float *distances_; int k_; int *search_labels_; @@ -162,31 +147,27 @@ class KNNTest : public ::testing::TestWithParam { int *expected_labels_; }; - const std::vector inputs = { // 2D - { - { - { 2.7810836,2.550537003 }, - { 1.465489372,2.362125076}, - { 3.396561688,4.400293529}, - { 1.38807019,1.850220317 }, - { 3.06407232,3.005305973 }, - { 7.627531214,2.759262235 }, - { 5.332441248,2.088626775 }, - { 6.922596716,1.77106367 }, - { 8.675418651,-0.242068655}, - {7.673756466,3.508563011 }, - }, - 2, - {0, 0, 0, 0, 0, 1, 1, 1, 1, 1} - } -}; + {{ + {2.7810836, 2.550537003}, + {1.465489372, 2.362125076}, + {3.396561688, 4.400293529}, + {1.38807019, 1.850220317}, + {3.06407232, 3.005305973}, + {7.627531214, 2.759262235}, + {5.332441248, 2.088626775}, + {6.922596716, 1.77106367}, + {8.675418651, -0.242068655}, + {7.673756466, 3.508563011}, + }, + 2, + {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}}}; typedef KNNTest KNNTestF; TEST_P(KNNTestF, BruteForce) { this->testBruteForce(); } INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestF, ::testing::ValuesIn(inputs)); -} // namespace knn -} // namespace raft +} // namespace knn +} // namespace raft From fe5a378c93b3e1a719b10ec9f72d6a5528e46a07 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Mon, 1 Feb 2021 11:50:44 -0600 Subject: [PATCH 17/19] Update license --- .../raft/spatial/knn/detail/brute_force_knn.hpp | 16 ++++++++++++++++ .../raft/spatial/knn/detail/processing.hpp | 2 +- cpp/include/raft/spatial/knn/knn.hpp | 2 +- cpp/test/spatial/knn.cu | 2 +- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index caf79265c6..00316753d8 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + #pragma once #include diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index 1b6589100f..9e86ac0550 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c)2020, NVIDIA CORPORATION. + * Copyright (c)2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index 668f5291e1..2bba8bfb6c 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 37bb859a53..683bcae7d9 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 9315752555050877f934522a08709b5dc2daff8a Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Mon, 1 Feb 2021 11:57:42 -0600 Subject: [PATCH 18/19] Add spatial namespace --- cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp | 2 ++ cpp/include/raft/spatial/knn/detail/processing.hpp | 6 ++++-- cpp/include/raft/spatial/knn/knn.hpp | 2 ++ cpp/test/spatial/knn.cu | 2 ++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index 00316753d8..1ca5be2052 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -36,6 +36,7 @@ #include "processing.hpp" namespace raft { +namespace spatial { namespace knn { namespace detail { @@ -355,4 +356,5 @@ void brute_force_knn_impl( } // namespace detail } // namespace knn +} // namespace spatial } // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index 9e86ac0550..a645412c2f 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -25,6 +25,7 @@ #include namespace raft { +namespace spatial { namespace knn { using deviceAllocator = raft::mr::device::allocator; @@ -186,5 +187,6 @@ template class CosineMetricProcessor; template class CorrelationMetricProcessor; template class DefaultMetricProcessor; -}; // namespace knn -}; // namespace raft +} // namespace knn +} // namespace spatial +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index 2bba8bfb6c..ccee635701 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -22,6 +22,7 @@ #include namespace raft { +namespace spatial { namespace knn { using deviceAllocator = raft::mr::device::allocator; @@ -68,4 +69,5 @@ void brute_force_knn( } } // namespace knn +} // namespace spatial } // namespace raft diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 
683bcae7d9..cfd4ecc9d1 100644
--- a/cpp/test/spatial/knn.cu
+++ b/cpp/test/spatial/knn.cu
@@ -23,6 +23,7 @@
 #include "../test_utils.h"
 
 namespace raft {
+namespace spatial {
 namespace knn {
 struct KNNInputs {
   std::vector<std::vector<float>> input;
@@ -170,4 +171,5 @@ TEST_P(KNNTestF, BruteForce) { this->testBruteForce(); }
 INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestF, ::testing::ValuesIn(inputs));
 
 } // namespace knn
+} // namespace spatial
 } // namespace raft

From 03c5a3574d44b8552fead8a9e9cb0bd3c9d5adf3 Mon Sep 17 00:00:00 2001
From: AJ Schmidt
Date: Mon, 1 Feb 2021 13:25:11 -0500
Subject: [PATCH 19/19] Prepare Changelog for Automation (#3442)

This PR prepares the changelog to be automatically updated during releases.

Authors:
  - AJ Schmidt (@ajschmidt8)

Approvers:
  - Dante Gama Dessavre (@dantegd)

URL: https://github.com/rapidsai/raft/pull/135
---
 CHANGELOG.md | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4747bca4e2..c1fdbbef24 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,12 +1,8 @@
-# RAFT 0.18.0 (Date TBD)
+# 0.18.0
 
-## New Features
-
-## Improvements
-
-## Bug Fixes
+Please see https://github.com/rapidsai/raft/releases/tag/branch-0.18-latest for the latest changes to this development branch.
 
-# RAFT 0.17.0 (Date TBD)
+# RAFT 0.17.0 (10 Dec 2020)
 
 ## New Features
 - PR #65: Adding cuml prims that break circular dependency between cuml and cumlprims projects
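
For reference, a minimal usage sketch of the public interface touched above: cpp/include/raft/spatial/knn/knn.hpp, which [PATCH 18/19] places in the raft::spatial::knn namespace. This is illustrative only, not code from the series: the element types of the input/sizes vectors (float* index partitions and int row counts) are inferred from the test in cpp/test/spatial/knn.cu, and the wrapper function, buffer names, and row counts are assumptions. Per the knn.hpp documentation, expanded = true would return lp-based distances without the final 1/p step (e.g. squared distances for L2), while the defaults below leave that post-processing in place.

// Sketch only (not part of these patches). Assumes a valid raft::handle_t,
// device-resident row-major buffers, and a single index partition; names such
// as query_knn, n_index_rows, and n_query_rows are illustrative.
#include <raft/spatial/knn/knn.hpp>

#include <cstdint>
#include <vector>

void query_knn(raft::handle_t &handle, float *index_data, int n_index_rows,
               float *search_items, int n_query_rows, int dim, int k,
               int64_t *res_I, float *res_D) {
  // One flat index partition; further device pointers and their row counts
  // could be appended to search several partitions in a single call.
  std::vector<float *> input{index_data};
  std::vector<int> sizes{n_index_rows};

  // After [PATCH 18/19] the free function lives in raft::spatial::knn.
  raft::spatial::knn::brute_force_knn(
      handle, input, sizes, dim, search_items, n_query_rows, res_I, res_D, k,
      /*rowMajorIndex=*/true, /*rowMajorQuery=*/true,
      raft::distance::DistanceType::L2Unexpanded,
      /*metric_arg=*/2.0f, /*expanded=*/false);
}

The k neighbors per query row are written to res_I/res_D as n_query_rows-by-k row-major blocks, which matches how the test above reads back rows_ * k_ entries from indices_ and distances_.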