From f090fc0fd46909394145ed333546cf5979abac69 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 10 Dec 2020 16:15:59 -0500 Subject: [PATCH 01/19] add gather & gatherv to raft::comms_t --- cpp/include/raft/comms/comms.hpp | 48 ++++++++++++++++++++++++++++ cpp/include/raft/comms/mpi_comms.hpp | 31 ++++++++++++++++++ cpp/include/raft/comms/std_comms.hpp | 31 ++++++++++++++++++ 3 files changed, 110 insertions(+) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 73e52e781b..0ca9f3972f 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -130,6 +130,15 @@ class comms_iface { const size_t* recvcounts, const size_t* displs, datatype_t datatype, cudaStream_t stream) const = 0; + virtual void gather(const void* sendbuff, void* recvbuff, size_t sendcount, + datatype_t datatype, int root, + cudaStream_t stream) const = 0; + + virtual void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount, + const size_t* recvcounts, const size_t* displs, + datatype_t datatype, int root, + cudaStream_t stream) const = 0; + virtual void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const = 0; @@ -316,6 +325,45 @@ class comms_t { get_type(), stream); } + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, + int root, cudaStream_t stream) const { + impl_->gather(static_cast(sendbuff), + static_cast(recvbuff), sendcount, get_type(), + root, stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @param value_t datatype of underlying buffers + * @param sendbuff buffer containing data to send + * @param recvbuff buffer containing data to receive + * @param sendcount number of elements in send buffer + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place the incoming data from each rank + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gatherv(const value_t* sendbuf, value_t* recvbuf, size_t sendcount, + const size_t* recvcounts, const size_t* displs, int root, + cudaStream_t stream) const { + impl_->gatherv(static_cast(sendbuf), + static_cast(recvbuf), sendcount, recvcounts, displs, + get_type(), root, stream); + } + /** * Reduces data from all ranks then scatters the result across ranks * @tparam value_t datatype of underlying buffers diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index a372702c34..dddc709fa7 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -232,6 +232,37 @@ class mpi_comms : public comms_iface { } } + void gather(const void* sendbuff, void* recvbuff, size_t sendcount, + datatype_t datatype, int root, cudaStream_t stream) const { + size_t dtype_size = get_datatype_size(datatype); + NCCL_TRY(ncclGroupStart()); + if 
(get_rank() == root) { + for (int r = 0; r < get_size(); ++r) { + NCCL_TRY(ncclRecv(recvbuff + sendcount * i * dtype_size, sendcount, + get_nccl_datatype(datatype), r, nccl_comm_, stream)); + } + } + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + nccl_comm_, stream)); + NCCL_TRY(ncclGroupEnd()); + } + + void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount, + const size_t* recvcounts, const size_t* displs, + datatype_t datatype, int root, cudaStream_t stream) const { + size_t dtype_size = get_datatype_size(datatype); + NCCL_TRY(ncclGroupStart()); + if (get_rank() == root) { + for (int r = 0; r < get_size(); ++r) { + NCCL_TRY(ncclRecv(recvbuff + displs[r] * dtype_size, recvcounts[r], + get_nccl_datatype(datatype), r, nccl_comm_, stream)); + } + } + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + nccl_comm_, stream)); + NCCL_TRY(ncclGroupEnd()); + } + void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const { NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index d4b9d2ba39..3cafdc87c0 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -346,6 +346,37 @@ class std_comms : public comms_iface { } } + void gather(const void *sendbuff, void *recvbuff, size_t sendcount, + datatype_t datatype, int root, cudaStream_t stream) const { + size_t dtype_size = get_datatype_size(datatype); + NCCL_TRY(ncclGroupStart()); + if (get_rank() == root) { + for (int r = 0; r < get_size(); ++r) { + NCCL_TRY(ncclRecv(recvbuff + sendcount * i * dtype_size, sendcount, + get_nccl_datatype(datatype), r, nccl_comm_, stream)); + } + } + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + nccl_comm_, stream)); + NCCL_TRY(ncclGroupEnd()); + } + + void gatherv(const void *sendbuf, void *recvbuf, size_t sendcount, + const size_t *recvcounts, const size_t *displs, + datatype_t datatype, int root, cudaStream_t stream) const { + size_t dtype_size = get_datatype_size(datatype); + NCCL_TRY(ncclGroupStart()); + if (get_rank() == root) { + for (int r = 0; r < get_size(); ++r) { + NCCL_TRY(ncclRecv(recvbuff + displs[r] * dtype_size, recvcounts[r], + get_nccl_datatype(datatype), r, nccl_comm_, stream)); + } + } + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + nccl_comm_, stream)); + NCCL_TRY(ncclGroupEnd()); + } + void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const { NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, From 7a46dc9d2280b3a0f899ce2b8c5799f19395464c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 26 Jan 2021 15:58:41 -0500 Subject: [PATCH 02/19] fix build errors --- cpp/include/raft/comms/mpi_comms.hpp | 16 +++++++++------- cpp/include/raft/comms/std_comms.hpp | 16 +++++++++------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index dddc709fa7..8aebcc80cc 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -238,27 +238,29 @@ class mpi_comms : public comms_iface { NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(recvbuff + sendcount * i * dtype_size, sendcount, - get_nccl_datatype(datatype), r, nccl_comm_, 
stream)); + NCCL_TRY(ncclRecv( + static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, + get_nccl_datatype(datatype), r, nccl_comm_, stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount, + void gatherv(const void* sendbuff, void* recvbuff, size_t sendcount, const size_t* recvcounts, const size_t* displs, datatype_t datatype, int root, cudaStream_t stream) const { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(recvbuff + displs[r] * dtype_size, recvcounts[r], - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, + recvcounts[r], get_nccl_datatype(datatype), r, + nccl_comm_, stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 3cafdc87c0..a304955ceb 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -352,27 +352,29 @@ class std_comms : public comms_iface { NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(recvbuff + sendcount * i * dtype_size, sendcount, - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv( + static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, + get_nccl_datatype(datatype), r, nccl_comm_, stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void *sendbuf, void *recvbuf, size_t sendcount, + void gatherv(const void *sendbuff, void *recvbuff, size_t sendcount, const size_t *recvcounts, const size_t *displs, datatype_t datatype, int root, cudaStream_t stream) const { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(recvbuff + displs[r] * dtype_size, recvcounts[r], - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv( + static_cast(recvbuff) + displs[r] * dtype_size, recvcounts[r], + get_nccl_datatype(datatype), r, nccl_comm_, stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), r, + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } From e6180740181c986b11d5a3557bbab76932805530 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 26 Jan 2021 23:42:42 -0500 Subject: [PATCH 03/19] fix a bug in reducescatter test --- cpp/include/raft/comms/test.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index fa7e471174..627d629e68 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -158,23 +158,23 @@ bool test_collective_allgather(const handle_t &handle, int root) { bool test_collective_reducescatter(const handle_t &handle, int root) { comms_t 
const &communicator = handle.get_comms(); - int const send = 1; + std::vector sends(communicator.get_size(), 1); cudaStream_t stream = handle.get_stream(); raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream, - 1); + sends.size()); raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, 1); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), + CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); From d390789761098d88193cd325e81712173d13d14d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 27 Jan 2021 00:20:38 -0500 Subject: [PATCH 04/19] add python gather & gatherv tests --- cpp/include/raft/comms/test.hpp | 96 ++++++++++++++++++++++++- python/raft/dask/common/__init__.py | 2 + python/raft/dask/common/comms_utils.pyx | 32 +++++++++ python/raft/test/test_comms.py | 4 ++ 4 files changed, 131 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index 627d629e68..5dc6f02d21 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -16,11 +16,13 @@ #pragma once -#include #include #include #include +#include +#include + namespace raft { namespace comms { @@ -155,6 +157,93 @@ bool test_collective_allgather(const handle_t &handle, int root) { return true; } +bool test_collective_gather(const handle_t &handle, int root) { + comms_t const &communicator = handle.get_comms(); + + int const send = communicator.get_rank(); + + cudaStream_t stream = handle.get_stream(); + + raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); + temp_d.resize(1, stream); + + raft::mr::device::buffer recv_d( + handle.get_device_allocator(), stream, + communicator.get_rank() == root ? 
communicator.get_size() : 0); + + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), + cudaMemcpyHostToDevice, stream)); + + communicator.gather(temp_d.data(), recv_d.data(), 1, root, stream); + communicator.sync_stream(stream); + + if (communicator.get_rank() == root) { + std::vector temp_h(communicator.get_size(), 0); + CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), + sizeof(int) * temp_h.size(), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + for (int i = 0; i < communicator.get_size(); i++) { + if (temp_h[i] != i) return false; + } + } + return true; +} + +bool test_collective_gatherv(const handle_t &handle, int root) { + comms_t const &communicator = handle.get_comms(); + + std::vector sendcounts(communicator.get_size()); + std::iota(sendcounts.begin(), sendcounts.end(), size_t{1}); + std::vector displacements(communicator.get_size() + 1, 0); + std::partial_sum(sendcounts.begin(), sendcounts.end(), + displacements.begin() + 1); + + std::vector sends(displacements[communicator.get_rank() + 1] - + displacements[communicator.get_rank()], + communicator.get_rank()); + + cudaStream_t stream = handle.get_stream(); + + raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); + temp_d.resize(sends.size(), stream); + + raft::mr::device::buffer recv_d( + handle.get_device_allocator(), stream, + communicator.get_rank() == root ? displacements.back() : 0); + + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), + sends.size() * sizeof(int), cudaMemcpyHostToDevice, + stream)); + + communicator.gatherv( + temp_d.data(), recv_d.data(), temp_d.size(), + communicator.get_rank() == root ? sendcounts.data() + : static_cast(nullptr), + communicator.get_rank() == root ? displacements.data() + : static_cast(nullptr), + root, stream); + communicator.sync_stream(stream); + + if (communicator.get_rank() == root) { + std::vector temp_h(displacements.back(), 0); + CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), + sizeof(int) * displacements.back(), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + for (int i = 0; i < communicator.get_size(); i++) { + if (std::count_if(temp_h.begin() + displacements[i], + temp_h.begin() + displacements[i + 1], + [i](auto val) { return val != i; }) != 0) { + return false; + } + } + } + return true; +} + bool test_collective_reducescatter(const handle_t &handle, int root) { comms_t const &communicator = handle.get_comms(); @@ -167,8 +256,9 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, 1); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), sends.size() * sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), + sends.size() * sizeof(int), cudaMemcpyHostToDevice, + stream)); communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream); diff --git a/python/raft/dask/common/__init__.py b/python/raft/dask/common/__init__.py index 788af46c92..73bb5d6700 100644 --- a/python/raft/dask/common/__init__.py +++ b/python/raft/dask/common/__init__.py @@ -21,6 +21,8 @@ from .comms_utils import perform_test_comms_allreduce from .comms_utils import perform_test_comms_send_recv from .comms_utils import perform_test_comms_allgather +from .comms_utils import perform_test_comms_gather +from .comms_utils import perform_test_comms_gatherv from .comms_utils import perform_test_comms_bcast from .comms_utils import 
perform_test_comms_reduce from .comms_utils import perform_test_comms_reducescatter diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx index 4dbd2f1a7c..1a703485a9 100644 --- a/python/raft/dask/common/comms_utils.pyx +++ b/python/raft/dask/common/comms_utils.pyx @@ -60,6 +60,8 @@ cdef extern from "raft/comms/test.hpp" namespace "raft::comms": bool test_collective_broadcast(const handle_t &h, int root) except + bool test_collective_reduce(const handle_t &h, int root) except + bool test_collective_allgather(const handle_t &h, int root) except + + bool test_collective_gather(const handle_t &h, int root) except + + bool test_collective_gatherv(const handle_t &h, int root) except + bool test_collective_reducescatter(const handle_t &h, int root) except + bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) except + @@ -131,6 +133,36 @@ def perform_test_comms_allgather(handle, root): return test_collective_allgather(deref(h), root) +def perform_test_comms_gather(handle, root): + """ + Performs a gather on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + root : int + Rank of the root worker + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_gather(deref(h), root) + + +def perform_test_comms_gatherv(handle, root): + """ + Performs a gatherv on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + root : int + Rank of the root worker + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_gatherv(deref(h), root) + + def perform_test_comms_send_recv(handle, n_trials): """ Performs a p2p send/recv on the current worker diff --git a/python/raft/test/test_comms.py b/python/raft/test/test_comms.py index 7dccb7bbae..a0db3b7f4f 100644 --- a/python/raft/test/test_comms.py +++ b/python/raft/test/test_comms.py @@ -28,6 +28,8 @@ from raft.dask.common import perform_test_comms_bcast from raft.dask.common import perform_test_comms_reduce from raft.dask.common import perform_test_comms_allgather + from raft.dask.common import perform_test_comms_gather + from raft.dask.common import perform_test_comms_gatherv from raft.dask.common import perform_test_comms_reducescatter from raft.dask.common import perform_test_comm_split @@ -130,6 +132,8 @@ def _has_handle(sessionId): perform_test_comms_allgather, perform_test_comms_allreduce, perform_test_comms_bcast, + perform_test_comms_gather, + perform_test_comms_gatherv, perform_test_comms_reduce, perform_test_comms_reducescatter, ] From 6a915c421b38cb1556b7fac1b162054b389e1785 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 6 Jan 2021 21:25:44 -0600 Subject: [PATCH 05/19] Add brute force knn --- build.sh | 8 +- cpp/CMakeLists.txt | 19 +- cpp/cmake/Dependencies.cmake | 44 +++- cpp/cmake/faiss_cuda11.patch | 40 +++ cpp/cmake/templates/CMakeLists.txt | 51 ++++ cpp/cmake/templates/Findpkg.cmake.in | 98 ++++++++ .../spatial/knn/detail/brute_force_knn.hpp | 163 ++++++++++++ cpp/include/raft/spatial/knn/detail/utils.hpp | 39 +++ cpp/include/raft/spatial/knn/knn.hpp | 237 ++++++++++++++++++ cpp/test/spatial/knn.cu | 123 +++++++++ 10 files changed, 819 insertions(+), 3 deletions(-) create mode 100644 cpp/cmake/faiss_cuda11.patch create mode 100644 cpp/cmake/templates/CMakeLists.txt create mode 100644 cpp/cmake/templates/Findpkg.cmake.in create mode 100644 cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp create mode 100644 
cpp/include/raft/spatial/knn/detail/utils.hpp create mode 100644 cpp/include/raft/spatial/knn/knn.hpp create mode 100644 cpp/test/spatial/knn.cu diff --git a/build.sh b/build.sh index 213aea9347..b05e002788 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean cppraft pyraft -v -g --allgpuarch --nvtx --show_depr_warn -h --buildgtest" +VALIDARGS="clean cppraft pyraft -v -g --allgpuarch --nvtx --show_depr_warn -h --buildgtest --buildfaiss" HELP="$0 [ ...] [ ...] where is: clean - remove all existing build artifacts and configuration (start over) @@ -29,6 +29,7 @@ HELP="$0 [ ...] [ ...] -v - verbose build mode -g - build for debug --allgpuarch - build for all supported GPU architectures + --buildfaiss - build faiss statically into raft --nvtx - Enable nvtx for profiling support --show_depr_warn - show cmake deprecation warnings -h - print this text @@ -44,6 +45,7 @@ BUILD_DIRS="${CPP_RAFT_BUILD_DIR} ${PY_RAFT_BUILD_DIR} ${PYTHON_DEPS_CLONE}" VERBOSE="" BUILD_ALL_GPU_ARCH=0 BUILD_GTEST=OFF +BUILD_STATIC_FAISS=OFF SINGLEGPU="" NVTX=OFF CLEAN=0 @@ -89,6 +91,9 @@ fi if hasArg --buildgtest; then BUILD_GTEST=ON fi +if hasArg --buildfaiss; then + BUILD_STATIC_FAISS=ON +fi if hasArg --singlegpu; then SINGLEGPU="--singlegpu" fi @@ -140,6 +145,7 @@ if (( ${NUMARGS} == 0 )) || hasArg cppraft; then -DNCCL_PATH=${INSTALL_PREFIX} \ -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \ -DBUILD_GTEST=${BUILD_GTEST} \ + -DBUILD_STATIC_FAISS=${BUILD_STATIC_FAISS} \ .. fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e62b4d11cc..691fbfdfd4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -42,6 +42,8 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) option(BUILD_GTEST "Build the GTEST library for running raft test executables" OFF) +option(BUILD_STATIC_FAISS "Build the FAISS library for nearest neighbors search on GPU" OFF) + option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON) option(EMPTY_MARKER_KERNEL "Enable empty marker kernel after nvtxRangePop" ON) @@ -65,10 +67,22 @@ set(GPU_ARCHS "" CACHE STRING ############################################################################## # - Requirements ------------------------------------------------------------- -find_package(CUDA 10.0 REQUIRED) +# Create FindPackage.cmake files to use find(package) functionality for +# dependencies + +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates) + +if(NOT BUILD_STATIC_FAISS) + GENERATE_FIND_MODULE( + NAME FAISS + HEADER_NAME faiss/IndexFlat.h + LIBRARY_NAME faiss) +endif(NOT BUILD_STATIC_FAISS) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) +find_package(CUDA 10.0 REQUIRED) + ############################################################################## # - Compiler Options -------------------------------------------------------- @@ -196,6 +210,7 @@ set(RAFT_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE STRING set(RAFT_INCLUDE_DIRECTORIES ${RAFT_INCLUDE_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + ${FAISS_INCLUDE_DIRS} ${RMM_INCLUDE_DIRS}) if(NOT CUB_IS_PART_OF_CTK) @@ -211,6 +226,7 @@ endif(DEFINED ENV{CONDA_PREFIX}) # - libraries ---------------------------------------------------------------- set(RAFT_LINK_LIBRARIES + FAISS::FAISS ${CUDA_cublas_LIBRARY} ${CUDA_cusolver_LIBRARY} ${CUDA_CUDART_LIBRARY} @@ -261,6 +277,7 @@ if(BUILD_RAFT_TESTS) test/random/rng.cu test/random/rng_int.cu test/random/sample_without_replacement.cu + test/spatial/knn.cu 
test/stats/mean.cu test/stats/mean_center.cu test/stats/stddev.cu diff --git a/cpp/cmake/Dependencies.cmake b/cpp/cmake/Dependencies.cmake index 64033327d6..c5dd705003 100644 --- a/cpp/cmake/Dependencies.cmake +++ b/cpp/cmake/Dependencies.cmake @@ -30,6 +30,46 @@ if(NOT CUB_IS_PART_OF_CTK) INSTALL_COMMAND "") endif(NOT CUB_IS_PART_OF_CTK) +############################################################################## +# - faiss -------------------------------------------------------------------- + +if(BUILD_STATIC_FAISS) + set(FAISS_DIR ${CMAKE_CURRENT_BINARY_DIR}/faiss CACHE STRING + "Path to FAISS source directory") + ExternalProject_Add(faiss + GIT_REPOSITORY https://github.com/facebookresearch/faiss.git + GIT_TAG a5b850dec6f1cd6c88ab467bfd5e87b0cac2e41d + CONFIGURE_COMMAND LIBS=-pthread + CPPFLAGS=-w + LDFLAGS=-L${CMAKE_INSTALL_PREFIX}/lib + ${CMAKE_CURRENT_BINARY_DIR}/faiss/src/faiss/configure + --prefix=${CMAKE_CURRENT_BINARY_DIR}/faiss + --with-blas=${BLAS_LIBRARIES} + --with-cuda=${CUDA_TOOLKIT_ROOT_DIR} + --with-cuda-arch=${FAISS_GPU_ARCHS} + -v + PREFIX ${FAISS_DIR} + BUILD_COMMAND make -j${PARALLEL_LEVEL} VERBOSE=1 + BUILD_BYPRODUCTS ${FAISS_DIR}/lib/libfaiss.a + BUILD_ALWAYS 1 + INSTALL_COMMAND make -s install > /dev/null + UPDATE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND patch -p1 -N < ${CMAKE_CURRENT_SOURCE_DIR}/cmake/faiss_cuda11.patch || true) + + ExternalProject_Get_Property(faiss install_dir) + add_library(FAISS::FAISS STATIC IMPORTED) + set_property(TARGET FAISS::FAISS PROPERTY + IMPORTED_LOCATION ${FAISS_DIR}/lib/libfaiss.a) + # to account for the FAISS file reorg that happened recently after the current + # pinned commit, just change the following line to + # set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/src/faiss") + set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/src") +else() + set(FAISS_INSTALL_DIR ENV{FAISS_ROOT}) + find_package(FAISS REQUIRED) +endif(BUILD_STATIC_FAISS) + ############################################################################## # - googletest --------------------------------------------------------------- @@ -65,4 +105,6 @@ endif(BUILD_GTEST) if(NOT CUB_IS_PART_OF_CTK) add_dependencies(GTest::GTest cub) -endif(NOT CUB_IS_PART_OF_CTK) +endif(NOT CUB_IS_PART_OF_CTK) +add_dependencies(FAISS::FAISS benchmark) +add_dependencies(FAISS::FAISS faiss) diff --git a/cpp/cmake/faiss_cuda11.patch b/cpp/cmake/faiss_cuda11.patch new file mode 100644 index 0000000000..496ca0e7b2 --- /dev/null +++ b/cpp/cmake/faiss_cuda11.patch @@ -0,0 +1,40 @@ +diff --git a/configure b/configure +index ed40dae..f88ed0a 100755 +--- a/configure ++++ b/configure +@@ -2970,7 +2970,7 @@ ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ex + ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + +- ax_cxx_compile_alternatives="11 0x" ax_cxx_compile_cxx11_required=true ++ ax_cxx_compile_alternatives="14 11 0x" ax_cxx_compile_cxx11_required=true + ac_ext=cpp + ac_cpp='$CXXCPP $CPPFLAGS' + ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +diff --git a/gpu/utils/DeviceDefs.cuh b/gpu/utils/DeviceDefs.cuh +index 89d3dda..bc0f9b5 100644 +--- a/gpu/utils/DeviceDefs.cuh ++++ b/gpu/utils/DeviceDefs.cuh +@@ -13,7 +13,7 @@ + namespace faiss { namespace gpu { + + #ifdef __CUDA_ARCH__ +-#if __CUDA_ARCH__ <= 750 ++#if __CUDA_ARCH__ <= 800 + constexpr int kWarpSize = 32; + #else + #error Unknown __CUDA_ARCH__; please define parameters for compute capability +diff --git a/gpu/utils/MatrixMult-inl.cuh b/gpu/utils/MatrixMult-inl.cuh +index ede225e..4f7eb44 100644 +--- 
a/gpu/utils/MatrixMult-inl.cuh ++++ b/gpu/utils/MatrixMult-inl.cuh +@@ -51,6 +51,9 @@ rawGemm(cublasHandle_t handle, + auto cBT = GetCudaType::Type; + + // Always accumulate in f32 ++# if __CUDACC_VER_MAJOR__ >= 11 ++ cublasSetMathMode(handle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); ++# endif + return cublasSgemmEx(handle, transa, transb, m, n, k, + &fAlpha, A, cAT, lda, + B, cBT, ldb, diff --git a/cpp/cmake/templates/CMakeLists.txt b/cpp/cmake/templates/CMakeLists.txt new file mode 100644 index 0000000000..5bf20c9061 --- /dev/null +++ b/cpp/cmake/templates/CMakeLists.txt @@ -0,0 +1,51 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +# File to generate Find*.cmake modules to find packages. +# This file uses Findpkd.cmake.in to generate modules so that we can use +# find(package) in the main CMake for any package we want. +# +# To use, include in CMakeLists.txt this folder, and then use the function +# GENERATE_FIND_MODULE, which takes the following parameters: +# NAME: +# Name of the package to find in find(NAME) +# HEADER_NAME: +# Name of header file to use to find include dirs path of the package. +# LIBRARY_NAME: +# (Optional) Name of library to find to find include lib path of the package. +# Assumed to be lib${NAME} if left empty. +# LOCATION: +# (Optional) Name of additional folder to look for headers/lib files. +# Useful if one wants to reduce size of #include commands +# VERSION: +# (Optional) Version of the package. Useful to find libraries that append +# version number to their filename (for example libopenblas.so.0) + + +function(GENERATE_FIND_MODULE) + set(oneValueArgs NAME LOCATION VERSION) + set(multiValueArgs HEADER_NAME LIBRARY_NAME) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + if(NOT DEFINED PKG_LIBRARY_NAME) + set(PKG_LIBRARY_NAME ${PKG_NAME}) + endif(NOT DEFINED PKG_LIBRARY_NAME) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/Findpkg.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/cmake/Find${PKG_NAME}.cmake + @ONLY) +endfunction() + diff --git a/cpp/cmake/templates/Findpkg.cmake.in b/cpp/cmake/templates/Findpkg.cmake.in new file mode 100644 index 0000000000..a81f167d62 --- /dev/null +++ b/cpp/cmake/templates/Findpkg.cmake.in @@ -0,0 +1,98 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Based on FindPNG.cmake from cmake 3.14.3 + +#[=======================================================================[.rst: +Find@PKG_NAME@ +-------- + +Template to generate FindPKG_NAME.cmake CMake modules + +Find @PKG_NAME@ + +Imported targets +^^^^^^^^^^^^^^^^ + +This module defines the following :prop_tgt:`IMPORTED` target: + +``@PKG_NAME@::@PKG_NAME@`` + The lib@PKG_NAME@ library, if found. + +Result variables +^^^^^^^^^^^^^^^^ + +This module will set the following variables in your project: + +``@PKG_NAME@_INCLUDE_DIRS`` + where to find @PKG_NAME@.hpp , etc. +``@PKG_NAME@_LIBRARIES`` + the libraries to link against to use lib@PKG_NAME@. +``@PKG_NAME@_FOUND`` + If false, do not try to use @PKG_NAME@. +``@PKG_NAME@_VERSION_STRING`` + the version of the @PKG_NAME@ library found + +#]=======================================================================] + +find_path(@PKG_NAME@_LOCATION @PKG_HEADER_NAME@ + HINTS ${@PKG_NAME@_INSTALL_DIR} + PATH_SUFFIXES include include/@PKG_LOCATION@) + +list(APPEND @PKG_NAME@_NAMES @PKG_LIBRARY_NAME@ lib@PKG_LIBRARY_NAME@) +set(_@PKG_NAME@_VERSION_SUFFIXES @PKG_VERSION@) + +foreach(v IN LISTS _@PKG_NAME@_VERSION_SUFFIXES) + list(APPEND @PKG_NAME@_NAMES @PKG_LIBRARY_NAME@${v} lib@PKG_LIBRARY_NAME@${v}) + list(APPEND @PKG_NAME@_NAMES @PKG_LIBRARY_NAME@.${v} lib@PKG_LIBRARY_NAME@.${v}) +endforeach() +unset(_@PKG_NAME@_VERSION_SUFFIXES) + +find_library(@PKG_NAME@_LIBRARY_RELEASE NAMES ${@PKG_NAME@_NAMES} + HINTS ${@PKG_NAME@_INSTALL_DIR} + PATH_SUFFIXES lib) + +include(${CMAKE_ROOT}/Modules/SelectLibraryConfigurations.cmake) +select_library_configurations(@PKG_NAME@) +mark_as_advanced(@PKG_NAME@_LIBRARY_RELEASE) +unset(@PKG_NAME@_NAMES) + +# Set by select_library_configurations(), but we want the one from +# find_package_handle_standard_args() below. 
+unset(@PKG_NAME@_FOUND) + +if (@PKG_NAME@_LIBRARY AND @PKG_NAME@_LOCATION) + set(@PKG_NAME@_INCLUDE_DIRS ${@PKG_NAME@_LOCATION} ) + set(@PKG_NAME@_LIBRARY ${@PKG_NAME@_LIBRARY}) + + if(NOT TARGET @PKG_NAME@::@PKG_NAME@) + add_library(@PKG_NAME@::@PKG_NAME@ UNKNOWN IMPORTED) + set_target_properties(@PKG_NAME@::@PKG_NAME@ PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${@PKG_NAME@_INCLUDE_DIRS}") + if(EXISTS "${@PKG_NAME@_LIBRARY}") + set_target_properties(@PKG_NAME@::@PKG_NAME@ PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" + IMPORTED_LOCATION "${@PKG_NAME@_LIBRARY}") + endif() + endif() +endif () + + +include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) +find_package_handle_standard_args(@PKG_NAME@ + REQUIRED_VARS @PKG_NAME@_LIBRARY @PKG_NAME@_LOCATION + VERSION_VAR @PKG_NAME@_VERSION_STRING) + +mark_as_advanced(@PKG_NAME@_LOCATION @PKG_NAME@_LIBRARY) diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp new file mode 100644 index 0000000000..16b295b3ca --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -0,0 +1,163 @@ +#pragma once + +#include "utils.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace raft { +namespace knn { +namespace detail { + +template +__global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, + value_t *outK, value_idx *outV, + size_t n_samples, int n_parts, + value_t initK, value_idx initV, int k, + value_idx *translations) { + constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; + + __shared__ value_t smemK[kNumWarps * warp_q]; + __shared__ value_idx smemV[kNumWarps * warp_q]; + + /** + * Uses shared memory + */ + faiss::gpu::BlockSelect, warp_q, thread_q, + tpb> + heap(initK, initV, smemK, smemV, k); + + // Grid is exactly sized to rows available + int row = blockIdx.x; + int total_k = k * n_parts; + + int i = threadIdx.x; + + // Get starting pointers for cols in current thread + int part = i / k; + size_t row_idx = (row * k) + (part * n_samples * k); + + int col = i % k; + + value_t *inKStart = inK + (row_idx + col); + value_idx *inVStart = inV + (row_idx + col); + + int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize); + value_idx translation = 0; + + for (; i < limit; i += tpb) { + translation = translations[part]; + heap.add(*inKStart, (*inVStart) + translation); + + part = (i + tpb) / k; + row_idx = (row * k) + (part * n_samples * k); + + col = (i + tpb) % k; + + inKStart = inK + (row_idx + col); + inVStart = inV + (row_idx + col); + } + + // Handle last remainder fraction of a warp of elements + if (i < total_k) { + translation = translations[part]; + heap.addThreadQ(*inKStart, (*inVStart) + translation); + } + + heap.reduce(); + + for (int i = threadIdx.x; i < k; i += tpb) { + outK[row * k + i] = smemK[i]; + outV[row * k + i] = smemV[i]; + } +} + +template +inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK, + value_idx *outV, size_t n_samples, int n_parts, + int k, cudaStream_t stream, + value_idx *translations) { + auto grid = dim3(n_samples); + + constexpr int n_threads = (warp_q <= 1024) ? 
128 : 64; + auto block = dim3(n_threads); + + auto kInit = faiss::gpu::Limits::getMax(); + auto vInit = -1; + knn_merge_parts_kernel + <<>>(inK, inV, outK, outV, n_samples, n_parts, + kInit, vInit, k, translations); + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief Merge knn distances and index matrix, which have been partitioned + * by row, into a single matrix with only the k-nearest neighbors. + * + * @param inK partitioned knn distance matrix + * @param inV partitioned knn index matrix + * @param outK merged knn distance matrix + * @param outV merged knn index matrix + * @param n_samples number of samples per partition + * @param n_parts number of partitions + * @param k number of neighbors per partition (also number of merged neighbors) + * @param stream CUDA stream to use + * @param translations mapping of index offsets for each partition + */ +template +inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, + value_idx *outV, size_t n_samples, int n_parts, + int k, cudaStream_t stream, + value_idx *translations) { + if (k == 1) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); + else if (k <= 32) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); + else if (k <= 64) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); + else if (k <= 128) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); + else if (k <= 256) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); + else if (k <= 512) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); + else if (k <= 1024) + knn_merge_parts_impl( + inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); +} + +inline faiss::MetricType build_faiss_metric(MetricType metric) { + switch (metric) { + case MetricType::METRIC_Cosine: + return faiss::MetricType::METRIC_INNER_PRODUCT; + case MetricType::METRIC_Correlation: + return faiss::MetricType::METRIC_INNER_PRODUCT; + default: + return (faiss::MetricType)metric; + } +} + +} // namespace detail +} // namespace knn +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/utils.hpp b/cpp/include/raft/spatial/knn/detail/utils.hpp new file mode 100644 index 0000000000..c8542772f7 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/utils.hpp @@ -0,0 +1,39 @@ + +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace raft { +namespace knn { + + enum MetricType { + METRIC_INNER_PRODUCT = 0, + METRIC_L2, + METRIC_L1, + METRIC_Linf, + METRIC_Lp, + + METRIC_Canberra = 20, + METRIC_BrayCurtis, + METRIC_JensenShannon, + + METRIC_Cosine = 100, + METRIC_Correlation + }; + +} // namespace knn +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp new file mode 100644 index 0000000000..b37d99f82a --- /dev/null +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/brute_force_knn.hpp" + +#include +#include + +namespace raft { + namespace knn { + +/** + * Search the kNN for the k-nearest neighbors of a set of query vectors + * @param[in] input vector of device device memory array pointers to search + * @param[in] sizes vector of memory sizes for each device array pointer in input + * @param[in] D number of cols in input and search_items + * @param[in] search_items set of vectors to query for neighbors + * @param[in] n number of items in search_items + * @param[out] res_I pointer to device memory for returning k nearest indices + * @param[out] res_D pointer to device memory for returning k nearest distances + * @param[in] k number of neighbors to query + * @param[in] allocator the device memory allocator to use for temporary scratch memory + * @param[in] userStream the main cuda stream to use + * @param[in] internalStreams optional when n_params > 0, the index partitions can be + * queried in parallel using these streams. Note that n_int_streams also + * has to be > 0 for these to be used and their cardinality does not need + * to correspond to n_parts. + * @param[in] n_int_streams size of internalStreams. When this is <= 0, only the + * user stream will be used. + * @param[in] rowMajorIndex are the index arrays in row-major layout? + * @param[in] rowMajorQuery are the query array in row-major layout? + * @param[in] translations translation ids for indices when index rows represent + * non-contiguous partitions + * @param[in] metric corresponds to the FAISS::metricType enum (default is euclidean) + * @param[in] metricArg metric argument to use. 
Corresponds to the p arg for lp norm + * @param[in] expanded_form whether or not lp variants should be reduced w/ lp-root + */ +template +void brute_force_knn_impl(std::vector &input, std::vector &sizes, + IntType D, float *search_items, IntType n, int64_t *res_I, + float *res_D, IntType k, + std::shared_ptr allocator, + cudaStream_t userStream, + cudaStream_t *internalStreams = nullptr, + int n_int_streams = 0, bool rowMajorIndex = true, + bool rowMajorQuery = true, + std::vector *translations = nullptr, + MetricType metric = MetricType::METRIC_L2, + float metricArg = 2.0, bool expanded_form = false) { + + ASSERT(input.size() == sizes.size(), + "input and sizes vectors should be the same size"); + + faiss::MetricType m = detail::build_faiss_metric(metric); + + std::vector *id_ranges; + if (translations == nullptr) { + // If we don't have explicit translations + // for offsets of the indices, build them + // from the local partitions + id_ranges = new std::vector(); + int64_t total_n = 0; + for (int i = 0; i < input.size(); i++) { + id_ranges->push_back(total_n); + total_n += sizes[i]; + } + } else { + // otherwise, use the given translations + id_ranges = translations; + } + + // perform preprocessing + std::unique_ptr> query_metric_processor = + create_processor(metric, n, D, k, rowMajorQuery, userStream, + allocator); + query_metric_processor->preprocess(search_items); + + std::vector>> metric_processors( + input.size()); + for (int i = 0; i < input.size(); i++) { + metric_processors[i] = create_processor( + metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); + metric_processors[i]->preprocess(input[i]); + } + + int device; + CUDA_CHECK(cudaGetDevice(&device)); + + raft::mr::device::buffer trans(allocator, userStream, id_ranges->size()); + raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), + userStream); + + raft::mr::device::buffer all_D(allocator, userStream, 0); + raft::mr::devic::buffer all_I(allocator, userStream, 0); + + float *out_D = res_D; + int64_t *out_I = res_I; + + if (input.size() > 1) { + all_D.resize(input.size() * k * n, userStream); + all_I.resize(input.size() * k * n, userStream); + + out_D = all_D.data(); + out_I = all_I.data(); + } + + // Sync user stream only if using other streams to parallelize query + if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); + + for (int i = 0; i < input.size(); i++) { + faiss::gpu::StandardGpuResources gpu_res; + + cudaStream_t stream = + raft::select_stream(userStream, internalStreams, n_int_streams, i); + + gpu_res.noTempMemory(); + gpu_res.setCudaMallocWarning(false); + gpu_res.setDefaultStream(device, stream); + + faiss::gpu::GpuDistanceParams args; + args.metric = m; + args.metricArg = metricArg; + args.k = k; + args.dims = D; + args.vectors = input[i]; + args.vectorsRowMajor = rowMajorIndex; + args.numVectors = sizes[i]; + args.queries = search_items; + args.queriesRowMajor = rowMajorQuery; + args.numQueries = n; + args.outDistances = out_D + (i * k * n); + args.outIndices = out_I + (i * k * n); + + /** + * @todo: Until FAISS supports pluggable allocation strategies, + * we will not reap the benefits of the pool allocator for + * avoiding device-wide synchronizations from cudaMalloc/cudaFree + */ + bfKnn(&gpu_res, args); + + CUDA_CHECK(cudaPeekAtLastError()); + } + + // Sync internal streams if used. We don't need to + // sync the user stream because we'll already have + // fully serial execution. 
+ for (int i = 0; i < n_int_streams; i++) { + CUDA_CHECK(cudaStreamSynchronize(internalStreams[i])); + } + + if (input.size() > 1 || translations != nullptr) { + // This is necessary for proper index translations. If there are + // no translations or partitions to combine, it can be skipped. + knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, + trans.data()); + } + + // Perform necessary post-processing + if ((m == faiss::MetricType::METRIC_L2 || + m == faiss::MetricType::METRIC_Lp) && + !expanded_form) { + /** + * post-processing + */ + float p = 0.5; // standard l2 + if (m == faiss::MetricType::METRIC_Lp) p = 1.0 / metricArg; + raft::linalg::unaryOp( + res_D, res_D, n * k, + [p] __device__(float input) { return powf(input, p); }, userStream); + } + + query_metric_processor->revert(search_items); + query_metric_processor->postprocess(out_D); + for (int i = 0; i < input.size(); i++) { + metric_processors[i]->revert(input[i]); + } + + if (translations == nullptr) delete id_ranges; +} + +/** + * @brief Flat C++ API function to perform a brute force knn on + * a series of input arrays and combine the results into a single + * output array for indexes and distances. + * + * @param[in] handle the cuml handle to use + * @param[in] input vector of pointers to the input arrays + * @param[in] sizes vector of sizes of input arrays + * @param[in] D the dimensionality of the arrays + * @param[in] search_items array of items to search of dimensionality D + * @param[in] n number of rows in search_items + * @param[out] res_I the resulting index array of size n * k + * @param[out] res_D the resulting distance array of size n * k + * @param[in] k the number of nearest neighbors to return + * @param[in] rowMajorIndex are the index arrays in row-major order? + * @param[in] rowMajorQuery are the query arrays in row-major order? + * @param[in] metric distance metric to use. Euclidean (L2) is used by + * default + * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. This + * is ignored if the metric_type is not Minkowski. + * @param[in] expanded should lp-based distances be returned in their expanded + * form (e.g., without raising to the 1/p power). + */ +void brute_force_knn(raft::handle_t &handle, std::vector &input, + std::vector &sizes, int D, float *search_items, int n, + int64_t *res_I, float *res_D, int k, bool rowMajorIndex, + bool rowMajorQuery, MetricType metric, float metric_arg, + bool expanded) { + ASSERT(input.size() == sizes.size(), + "input and sizes vectors must be the same size"); + + std::vector int_streams = handle.get_internal_streams(); + + brute_force_knn_impl( + input, sizes, D, search_items, n, res_I, res_D, k, + handle.get_device_allocator(), handle.get_stream(), int_streams.data(), + handle.get_num_internal_streams(), rowMajorIndex, rowMajorQuery, nullptr, + metric, metric_arg, expanded); +} + +} // namespace knn +} // namespace raft diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu new file mode 100644 index 0000000000..85780cfcf4 --- /dev/null +++ b/cpp/test/spatial/knn.cu @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +namespace raft { + + struct KNNInputs { + std::vector> input; + int k; + }; + +template +class KNNTest : public ::testing::TestWithParam { + protected: + void testBruteForce() { + + brute_force_knn(handle_, + input_, + sizes_, + cols_, + search_data_, + rows_, + indices_, + distances_, + k_, + true, + true); + } + + void SetUp() override { + params = ::testing::TestWithParam::GetParam(); + rows_ = params.input.size(); + cols_ = params.input[0].size(); + k_ = params.k; + + float *input_d = rmm::device_buffer(params.input.data(), + params.input.size() * sizeof(float)); + + input_.push_back(input_d); + sizes_.push_back(rows_); + + raft::allocate(search_data_, row_ * cols_, true); + raft::allocate(indices_, + rows_ * cols_, + true); + raft::allocate(distances_, + rows_ * cols_, + true); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(search_data)); + CUDA_CHECK(cudaFree(search_labels)); + CUDA_CHECK(cudaFree(output_dists)); + CUDA_CHECK(cudaFree(output_indices)); + CUDA_CHECK(cudaFree(actual_labels)); + CUDA_CHECK(cudaFree(expected_labels)); + } + + private: + raft::handle_t handle_; + KNNInputs params_; + int rows_; + int cols_; + std::vector input_; + std::vector sizes_; + float *search_data_; + int64_t indices_; + float* distances_; + int k_; +}; + + +const std::vector inputs = { + // 2D + { + { + { 7.89611 , -6.3093657 }, + { 8.198494 , -6.6102095 }, + {-1.067701 , 0.2757877 }, + { 5.5629272, -4.0279684 }, + { 8.466168 , -6.3818727 }, + { 7.373038 , -3.2476108 }, + { 7.3618903, -6.311329 }, + { 3.5585778, 2.3175476 }, + { 8.722544 , -6.184722 }, + { 5.9165254, -4.0085735 }, + {-2.4502695, 1.8806121 }, + { 1.250205 , 1.6940732 }, + { 7.702861 , -5.5382366 }, + {-0.32521492, 1.0503006 }, + { 7.203165 , -6.1078873 }, + { 0.7067232, -0.02844107}, + {-0.6195269, 1.6659582 }, + { 7.3585844, -6.5425425 }, + { 0.2946735, 0.7920021 }, + { 5.9978905, -4.235259 }}, + 2}, +}; + +typedef KNNTest KNNTestF; +TEST_P(KNNTestF, BruteForce) { this->testBruteForce(); } + +INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestF, ::testing::ValuesIn(inputs)); + +} // namespace raft From c775437d3efb597bfc1c47034a914d2c59562f4b Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Thu, 7 Jan 2021 11:43:08 -0600 Subject: [PATCH 06/19] Fix compile errors --- .../spatial/knn/detail/brute_force_knn.hpp | 1 + .../raft/spatial/knn/detail/processing.hpp | 191 ++++++++++++++++++ cpp/include/raft/spatial/knn/knn.hpp | 7 +- 3 files changed, 197 insertions(+), 2 deletions(-) create mode 100644 cpp/include/raft/spatial/knn/detail/processing.hpp diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index 16b295b3ca..29eccfd43b 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp new file mode 100644 index 0000000000..57c7e1050c 
--- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -0,0 +1,191 @@ +/* + * Copyright (c)2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "utils.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace knn { + +using deviceAllocator = raft::mr::device::allocator; +/** + * @brief A virtual class defining pre- and post-processing + * for metrics. This class will temporarily modify its given + * state in `preprocess()` and undo those modifications in + * `postprocess()` + */ + +template +class MetricProcessor { + public: + virtual void preprocess(math_t *data) {} + + virtual void revert(math_t *data) {} + + virtual void postprocess(math_t *data) {} + + virtual ~MetricProcessor() = default; +}; + +template +class CosineMetricProcessor : public MetricProcessor { + protected: + int k_; + bool row_major_; + size_t n_rows_; + size_t n_cols_; + cudaStream_t stream_; + std::shared_ptr device_allocator_; + raft::mr::device::buffer colsums_; + + public: + CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, + cudaStream_t stream, + std::shared_ptr allocator) + : device_allocator_(allocator), + stream_(stream), + colsums_(allocator, stream, n_rows), + n_cols_(n_cols), + n_rows_(n_rows), + row_major_(row_major), + k_(k) {} + + void preprocess(math_t *data) { + raft::linalg::rowNorm(colsums_.data(), data, n_cols_, n_rows_, + raft::linalg::NormType::L2Norm, row_major_, stream_, + [] __device__(math_t in) { return sqrtf(in); }); + + raft::linalg::matrixVectorOp( + data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, + [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; }, + stream_); + } + + void revert(math_t *data) { + raft::linalg::matrixVectorOp( + data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, + [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; }, + stream_); + } + + void postprocess(math_t *data) { + raft::linalg::unaryOp( + data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, + stream_); + } + + ~CosineMetricProcessor() = default; +}; + +template +class CorrelationMetricProcessor : public CosineMetricProcessor { + using cosine = CosineMetricProcessor; + + public: + CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k, + bool row_major, cudaStream_t stream, + std::shared_ptr allocator) + : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream, + allocator), + means_(allocator, stream, n_rows) {} + + void preprocess(math_t *data) { + math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_; + + raft::linalg::reduce(means_.data(), data, cosine::n_cols_, cosine::n_rows_, + (math_t)0.0, cosine::row_major_, true, + cosine::stream_); + + raft::linalg::unaryOp( + means_.data(), means_.data(), cosine::n_rows_, + [=] __device__(math_t in) { return in * normalizer_const; }, + cosine::stream_); + + raft::stats::meanCenter(data, data, means_.data(), 
cosine::n_cols_, + cosine::n_rows_, cosine::row_major_, false, + cosine::stream_); + + CosineMetricProcessor::preprocess(data); + } + + void revert(math_t *data) { + CosineMetricProcessor::revert(data); + + raft::stats::meanAdd(data, data, means_.data(), cosine::n_cols_, + cosine::n_rows_, cosine::row_major_, false, + cosine::stream_); + } + + void postprocess(math_t *data) { + CosineMetricProcessor::postprocess(data); + } + + ~CorrelationMetricProcessor() = default; + + raft::mr::device::buffer means_; +}; + +template +class DefaultMetricProcessor : public MetricProcessor { + public: + void preprocess(math_t *data) {} + + void revert(math_t *data) {} + + void postprocess(math_t *data) {} + + ~DefaultMetricProcessor() = default; +}; + +template +inline std::unique_ptr> create_processor( + MetricType metric, int n, int D, int k, bool rowMajorQuery, + cudaStream_t userStream, std::shared_ptr allocator) { + MetricProcessor *mp = nullptr; + + switch (metric) { + case MetricType::METRIC_Cosine: + mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, + allocator); + break; + + case MetricType::METRIC_Correlation: + mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, + userStream, allocator); + break; + default: + mp = new DefaultMetricProcessor(); + } + + return std::unique_ptr>(mp); +} + +// Currently only being used by floats +template class MetricProcessor; +template class CosineMetricProcessor; +template class CorrelationMetricProcessor; +template class DefaultMetricProcessor; + +}; // namespace knn +}; // namespace raft diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index b37d99f82a..9abafe6681 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -17,6 +17,7 @@ #pragma once #include "detail/brute_force_knn.hpp" +#include "detail/processing.hpp" #include #include @@ -24,6 +25,8 @@ namespace raft { namespace knn { +using deviceAllocator = raft::mr::device::allocator; + /** * Search the kNN for the k-nearest neighbors of a set of query vectors * @param[in] input vector of device device memory array pointers to search @@ -106,7 +109,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, userStream); raft::mr::device::buffer all_D(allocator, userStream, 0); - raft::mr::devic::buffer all_I(allocator, userStream, 0); + raft::mr::device::buffer all_I(allocator, userStream, 0); float *out_D = res_D; int64_t *out_I = res_I; @@ -166,7 +169,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, if (input.size() > 1 || translations != nullptr) { // This is necessary for proper index translations. If there are // no translations or partitions to combine, it can be skipped. 
- knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, + detail::knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, trans.data()); } From a22fddb1573ea3189e9c0823dde08f95f43ebbe7 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Fri, 8 Jan 2021 19:06:39 -0600 Subject: [PATCH 07/19] Add knn test --- cpp/CMakeLists.txt | 78 ++++---- .../spatial/knn/detail/brute_force_knn.hpp | 170 +++++++++++++++++ cpp/include/raft/spatial/knn/knn.hpp | 172 +----------------- cpp/test/spatial/knn.cu | 171 +++++++++++------ 4 files changed, 329 insertions(+), 262 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 691fbfdfd4..82d7c3d2f0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -248,45 +248,45 @@ if(BUILD_RAFT_TESTS) # keep the files in alphabetical order! add_executable(test_raft - test/cudart_utils.cpp - test/handle.cpp - test/integer_utils.cpp - test/lap/lap.cu - test/linalg/add.cu - test/linalg/binary_op.cu - test/linalg/cholesky_r1.cu - test/linalg/coalesced_reduction.cu - test/linalg/divide.cu - test/linalg/eig.cu - test/linalg/eig_sel.cu - test/linalg/gemm_layout.cu - test/linalg/map_then_reduce.cu - test/linalg/matrix_vector_op.cu - test/linalg/multiply.cu - test/linalg/norm.cu - test/linalg/reduce.cu - test/linalg/strided_reduction.cu - test/linalg/subtract.cu - test/linalg/svd.cu - test/linalg/transpose.cu - test/linalg/unary_op.cu - test/matrix/math.cu - test/matrix/matrix.cu - test/mr/device/buffer.cpp - test/mr/host/buffer.cpp - test/random/rng.cu - test/random/rng_int.cu - test/random/sample_without_replacement.cu - test/spatial/knn.cu - test/stats/mean.cu - test/stats/mean_center.cu - test/stats/stddev.cu - test/stats/sum.cu - test/test.cpp - test/spectral_matrix.cu - test/eigen_solvers.cu - test/cluster_solvers.cu - test/mst.cu) + # test/cudart_utils.cpp + #test/handle.cpp + #test/integer_utils.cpp + #test/lap/lap.cu + #test/linalg/add.cu + #test/linalg/binary_op.cu + #test/linalg/cholesky_r1.cu + #test/linalg/coalesced_reduction.cu + #test/linalg/divide.cu + #test/linalg/eig.cu + #test/linalg/eig_sel.cu + #test/linalg/gemm_layout.cu + #test/linalg/map_then_reduce.cu + #test/linalg/matrix_vector_op.cu + #test/linalg/multiply.cu + #test/linalg/norm.cu + #test/linalg/reduce.cu + #test/linalg/strided_reduction.cu + #test/linalg/subtract.cu + #test/linalg/svd.cu + #test/linalg/transpose.cu + #test/linalg/unary_op.cu + #test/matrix/math.cu + #test/matrix/matrix.cu + #test/mr/device/buffer.cpp + #test/mr/host/buffer.cpp + #test/random/rng.cu + #test/random/rng_int.cu + #test/random/sample_without_replacement.cu + test/spatial/knn.cu) + #test/stats/mean.cu + #test/stats/mean_center.cu + #test/stats/stddev.cu + #test/stats/sum.cu + #test/test.cpp + #test/spectral_matrix.cu + #test/eigen_solvers.cu + #test/cluster_solvers.cu + #test/mst.cu) target_include_directories(test_raft PRIVATE diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index 29eccfd43b..432c3b2cf5 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -1,5 +1,6 @@ #pragma once +#include "processing.hpp" #include "utils.hpp" #include @@ -159,6 +160,175 @@ inline faiss::MetricType build_faiss_metric(MetricType metric) { } } +/** + * Search the kNN for the k-nearest neighbors of a set of query vectors + * @param[in] input vector of device device memory array pointers to search + * @param[in] sizes 
vector of memory sizes for each device array pointer in input + * @param[in] D number of cols in input and search_items + * @param[in] search_items set of vectors to query for neighbors + * @param[in] n number of items in search_items + * @param[out] res_I pointer to device memory for returning k nearest indices + * @param[out] res_D pointer to device memory for returning k nearest distances + * @param[in] k number of neighbors to query + * @param[in] allocator the device memory allocator to use for temporary scratch memory + * @param[in] userStream the main cuda stream to use + * @param[in] internalStreams optional when n_params > 0, the index partitions can be + * queried in parallel using these streams. Note that n_int_streams also + * has to be > 0 for these to be used and their cardinality does not need + * to correspond to n_parts. + * @param[in] n_int_streams size of internalStreams. When this is <= 0, only the + * user stream will be used. + * @param[in] rowMajorIndex are the index arrays in row-major layout? + * @param[in] rowMajorQuery are the query array in row-major layout? + * @param[in] translations translation ids for indices when index rows represent + * non-contiguous partitions + * @param[in] metric corresponds to the FAISS::metricType enum (default is euclidean) + * @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm + * @param[in] expanded_form whether or not lp variants should be reduced w/ lp-root + */ +template +void brute_force_knn_impl(std::vector &input, std::vector &sizes, + IntType D, float *search_items, IntType n, int64_t *res_I, + float *res_D, IntType k, + std::shared_ptr allocator, + cudaStream_t userStream, + cudaStream_t *internalStreams = nullptr, + int n_int_streams = 0, bool rowMajorIndex = true, + bool rowMajorQuery = true, + std::vector *translations = nullptr, + MetricType metric = MetricType::METRIC_L2, + float metricArg = 2.0, bool expanded_form = false) { + + ASSERT(input.size() == sizes.size(), + "input and sizes vectors should be the same size"); + + faiss::MetricType m = detail::build_faiss_metric(metric); + + std::vector *id_ranges; + if (translations == nullptr) { + // If we don't have explicit translations + // for offsets of the indices, build them + // from the local partitions + id_ranges = new std::vector(); + int64_t total_n = 0; + for (int i = 0; i < input.size(); i++) { + id_ranges->push_back(total_n); + total_n += sizes[i]; + } + } else { + // otherwise, use the given translations + id_ranges = translations; + } + + // perform preprocessing + std::unique_ptr> query_metric_processor = + create_processor(metric, n, D, k, rowMajorQuery, userStream, + allocator); + query_metric_processor->preprocess(search_items); + + std::vector>> metric_processors( + input.size()); + for (int i = 0; i < input.size(); i++) { + metric_processors[i] = create_processor( + metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); + metric_processors[i]->preprocess(input[i]); + } + + int device; + CUDA_CHECK(cudaGetDevice(&device)); + + raft::mr::device::buffer trans(allocator, userStream, id_ranges->size()); + raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), + userStream); + + raft::mr::device::buffer all_D(allocator, userStream, 0); + raft::mr::device::buffer all_I(allocator, userStream, 0); + + float *out_D = res_D; + int64_t *out_I = res_I; + + if (input.size() > 1) { + all_D.resize(input.size() * k * n, userStream); + all_I.resize(input.size() * k * n, userStream); + + out_D = all_D.data(); + 
out_I = all_I.data(); + } + + // Sync user stream only if using other streams to parallelize query + if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); + + for (int i = 0; i < input.size(); i++) { + faiss::gpu::StandardGpuResources gpu_res; + + cudaStream_t stream = + raft::select_stream(userStream, internalStreams, n_int_streams, i); + + gpu_res.noTempMemory(); + gpu_res.setCudaMallocWarning(false); + gpu_res.setDefaultStream(device, stream); + + faiss::gpu::GpuDistanceParams args; + args.metric = m; + args.metricArg = metricArg; + args.k = k; + args.dims = D; + args.vectors = input[i]; + args.vectorsRowMajor = rowMajorIndex; + args.numVectors = sizes[i]; + args.queries = search_items; + args.queriesRowMajor = rowMajorQuery; + args.numQueries = n; + args.outDistances = out_D + (i * k * n); + args.outIndices = out_I + (i * k * n); + + /** + * @todo: Until FAISS supports pluggable allocation strategies, + * we will not reap the benefits of the pool allocator for + * avoiding device-wide synchronizations from cudaMalloc/cudaFree + */ + bfKnn(&gpu_res, args); + + CUDA_CHECK(cudaPeekAtLastError()); + } + + // Sync internal streams if used. We don't need to + // sync the user stream because we'll already have + // fully serial execution. + for (int i = 0; i < n_int_streams; i++) { + CUDA_CHECK(cudaStreamSynchronize(internalStreams[i])); + } + + if (input.size() > 1 || translations != nullptr) { + // This is necessary for proper index translations. If there are + // no translations or partitions to combine, it can be skipped. + detail::knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, + trans.data()); + } + + // Perform necessary post-processing + if ((m == faiss::MetricType::METRIC_L2 || + m == faiss::MetricType::METRIC_Lp) && + !expanded_form) { + /** + * post-processing + */ + float p = 0.5; // standard l2 + if (m == faiss::MetricType::METRIC_Lp) p = 1.0 / metricArg; + raft::linalg::unaryOp( + res_D, res_D, n * k, + [p] __device__(float input) { return powf(input, p); }, userStream); + } + + query_metric_processor->revert(search_items); + query_metric_processor->postprocess(out_D); + for (int i = 0; i < input.size(); i++) { + metric_processors[i]->revert(input[i]); + } + + if (translations == nullptr) delete id_ranges; +} + } // namespace detail } // namespace knn } // namespace raft diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index 9abafe6681..b6e3ed09ab 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -17,7 +17,6 @@ #pragma once #include "detail/brute_force_knn.hpp" -#include "detail/processing.hpp" #include #include @@ -27,175 +26,6 @@ namespace raft { using deviceAllocator = raft::mr::device::allocator; -/** - * Search the kNN for the k-nearest neighbors of a set of query vectors - * @param[in] input vector of device device memory array pointers to search - * @param[in] sizes vector of memory sizes for each device array pointer in input - * @param[in] D number of cols in input and search_items - * @param[in] search_items set of vectors to query for neighbors - * @param[in] n number of items in search_items - * @param[out] res_I pointer to device memory for returning k nearest indices - * @param[out] res_D pointer to device memory for returning k nearest distances - * @param[in] k number of neighbors to query - * @param[in] allocator the device memory allocator to use for temporary scratch memory - * @param[in] userStream the main cuda stream to use - * 
@param[in] internalStreams optional when n_params > 0, the index partitions can be - * queried in parallel using these streams. Note that n_int_streams also - * has to be > 0 for these to be used and their cardinality does not need - * to correspond to n_parts. - * @param[in] n_int_streams size of internalStreams. When this is <= 0, only the - * user stream will be used. - * @param[in] rowMajorIndex are the index arrays in row-major layout? - * @param[in] rowMajorQuery are the query array in row-major layout? - * @param[in] translations translation ids for indices when index rows represent - * non-contiguous partitions - * @param[in] metric corresponds to the FAISS::metricType enum (default is euclidean) - * @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm - * @param[in] expanded_form whether or not lp variants should be reduced w/ lp-root - */ -template -void brute_force_knn_impl(std::vector &input, std::vector &sizes, - IntType D, float *search_items, IntType n, int64_t *res_I, - float *res_D, IntType k, - std::shared_ptr allocator, - cudaStream_t userStream, - cudaStream_t *internalStreams = nullptr, - int n_int_streams = 0, bool rowMajorIndex = true, - bool rowMajorQuery = true, - std::vector *translations = nullptr, - MetricType metric = MetricType::METRIC_L2, - float metricArg = 2.0, bool expanded_form = false) { - - ASSERT(input.size() == sizes.size(), - "input and sizes vectors should be the same size"); - - faiss::MetricType m = detail::build_faiss_metric(metric); - - std::vector *id_ranges; - if (translations == nullptr) { - // If we don't have explicit translations - // for offsets of the indices, build them - // from the local partitions - id_ranges = new std::vector(); - int64_t total_n = 0; - for (int i = 0; i < input.size(); i++) { - id_ranges->push_back(total_n); - total_n += sizes[i]; - } - } else { - // otherwise, use the given translations - id_ranges = translations; - } - - // perform preprocessing - std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, k, rowMajorQuery, userStream, - allocator); - query_metric_processor->preprocess(search_items); - - std::vector>> metric_processors( - input.size()); - for (int i = 0; i < input.size(); i++) { - metric_processors[i] = create_processor( - metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); - metric_processors[i]->preprocess(input[i]); - } - - int device; - CUDA_CHECK(cudaGetDevice(&device)); - - raft::mr::device::buffer trans(allocator, userStream, id_ranges->size()); - raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), - userStream); - - raft::mr::device::buffer all_D(allocator, userStream, 0); - raft::mr::device::buffer all_I(allocator, userStream, 0); - - float *out_D = res_D; - int64_t *out_I = res_I; - - if (input.size() > 1) { - all_D.resize(input.size() * k * n, userStream); - all_I.resize(input.size() * k * n, userStream); - - out_D = all_D.data(); - out_I = all_I.data(); - } - - // Sync user stream only if using other streams to parallelize query - if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); - - for (int i = 0; i < input.size(); i++) { - faiss::gpu::StandardGpuResources gpu_res; - - cudaStream_t stream = - raft::select_stream(userStream, internalStreams, n_int_streams, i); - - gpu_res.noTempMemory(); - gpu_res.setCudaMallocWarning(false); - gpu_res.setDefaultStream(device, stream); - - faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = metricArg; - args.k = k; - args.dims = D; - 
args.vectors = input[i]; - args.vectorsRowMajor = rowMajorIndex; - args.numVectors = sizes[i]; - args.queries = search_items; - args.queriesRowMajor = rowMajorQuery; - args.numQueries = n; - args.outDistances = out_D + (i * k * n); - args.outIndices = out_I + (i * k * n); - - /** - * @todo: Until FAISS supports pluggable allocation strategies, - * we will not reap the benefits of the pool allocator for - * avoiding device-wide synchronizations from cudaMalloc/cudaFree - */ - bfKnn(&gpu_res, args); - - CUDA_CHECK(cudaPeekAtLastError()); - } - - // Sync internal streams if used. We don't need to - // sync the user stream because we'll already have - // fully serial execution. - for (int i = 0; i < n_int_streams; i++) { - CUDA_CHECK(cudaStreamSynchronize(internalStreams[i])); - } - - if (input.size() > 1 || translations != nullptr) { - // This is necessary for proper index translations. If there are - // no translations or partitions to combine, it can be skipped. - detail::knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, - trans.data()); - } - - // Perform necessary post-processing - if ((m == faiss::MetricType::METRIC_L2 || - m == faiss::MetricType::METRIC_Lp) && - !expanded_form) { - /** - * post-processing - */ - float p = 0.5; // standard l2 - if (m == faiss::MetricType::METRIC_Lp) p = 1.0 / metricArg; - raft::linalg::unaryOp( - res_D, res_D, n * k, - [p] __device__(float input) { return powf(input, p); }, userStream); - } - - query_metric_processor->revert(search_items); - query_metric_processor->postprocess(out_D); - for (int i = 0; i < input.size(); i++) { - metric_processors[i]->revert(input[i]); - } - - if (translations == nullptr) delete id_ranges; -} - /** * @brief Flat C++ API function to perform a brute force knn on * a series of input arrays and combine the results into a single @@ -229,7 +59,7 @@ void brute_force_knn(raft::handle_t &handle, std::vector &input, std::vector int_streams = handle.get_internal_streams(); - brute_force_knn_impl( + detail::brute_force_knn_impl( input, sizes, D, search_items, n, res_I, res_D, k, handle.get_device_allocator(), handle.get_stream(), int_streams.data(), handle.get_num_internal_streams(), rowMajorIndex, rowMajorQuery, nullptr, diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 85780cfcf4..bfacc4935e 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -17,23 +17,63 @@ #include #include #include +#include #include namespace raft { - +namespace knn { struct KNNInputs { std::vector> input; int k; + std::vector labels; }; +__global__ void build_actual_output(int *output, int n_rows, int k, + const int *idx_labels, + const int64_t *indices) { + int element = threadIdx.x + blockDim.x * blockIdx.x; + if (element >= n_rows * k) return; + + int ind = (int)indices[element]; + output[element] = idx_labels[ind]; +} + +__global__ void build_expected_output(int *output, int n_rows, int k, + const int *labels) { + int row = threadIdx.x + blockDim.x * blockIdx.x; + if (row >= n_rows) return; + + int cur_label = labels[row]; + for (int i = 0; i < k; i++) { + output[row * k + i] = cur_label; + } +} + template class KNNTest : public ::testing::TestWithParam { protected: void testBruteForce() { + raft::print_device_vector("Input array: ", input_, + rows_ * cols_, std::cout); + std::cout << "K: " << k_ << "\n"; + raft::print_device_vector("Labels array: ", search_labels_, + rows_, std::cout); + + auto stream = handle_.get_stream(); + + raft::allocate(actual_labels_, rows_ * k_, true); + 
raft::allocate(expected_labels_, rows_ * k_, true); + + std::vector input_vec; + std::vector sizes_vec; + input_vec.push_back(input_); + sizes_vec.push_back(rows_); + + brute_force_knn(handle_, - input_, - sizes_, + input_vec, + sizes_vec, cols_, search_data_, rows_, @@ -41,37 +81,67 @@ class KNNTest : public ::testing::TestWithParam { distances_, k_, true, - true); + true, + MetricType::METRIC_L2, + 0.0, + false); + + build_actual_output<<>>( + actual_labels_, rows_, + k_, search_labels_, + indices_); + + build_expected_output<<>>( + expected_labels_, rows_, + k_, search_labels_); + + raft::print_device_vector("Output indices: ", indices_, + rows_ * k_, std::cout); + raft::print_device_vector("Output distances: ", distances_, + rows_ * k_, std::cout); + raft::print_device_vector("Output labels: ", actual_labels_, + rows_ * k_, std::cout); + raft::print_device_vector("Expected labels: ", expected_labels_, + rows_ * k_, std::cout); } void SetUp() override { - params = ::testing::TestWithParam::GetParam(); - rows_ = params.input.size(); - cols_ = params.input[0].size(); - k_ = params.k; - - float *input_d = rmm::device_buffer(params.input.data(), - params.input.size() * sizeof(float)); - - input_.push_back(input_d); - sizes_.push_back(rows_); - - raft::allocate(search_data_, row_ * cols_, true); - raft::allocate(indices_, - rows_ * cols_, - true); - raft::allocate(distances_, - rows_ * cols_, - true); + params_ = ::testing::TestWithParam::GetParam(); + rows_ = params_.input.size(); + cols_ = params_.input[0].size(); + k_ = params_.k; + + std::vector row_major_input; + for (int i = 0; i < params_.input.size(); ++i) { + for (int j = 0; j < params_.input[i].size(); ++j) { + row_major_input.push_back(params_.input[i][j]); + } + } + rmm::device_buffer input_d = rmm::device_buffer(row_major_input.data(), + row_major_input.size() * sizeof(float)); + float *input_ptr = static_cast(input_d.data()); + + rmm::device_buffer labels_d = rmm::device_buffer(params_.labels.data(), + params_.labels.size() * sizeof(int)); + int *labels_ptr = static_cast(labels_d.data()); + + raft::allocate(input_, rows_ * cols_, true); + raft::allocate(search_data_, rows_ * cols_, true); + raft::allocate(search_data_, rows_ * cols_, true); + raft::allocate(indices_, rows_ * k_, true); + raft::allocate(distances_, rows_ * k_, true); + raft::allocate(search_labels_, rows_, true); + + raft::copy(input_, input_ptr, rows_ * cols_, handle_.get_stream()); + raft::copy(search_data_, input_ptr, rows_ * cols_, handle_.get_stream()); + raft::copy(search_labels_, labels_ptr, rows_, handle_.get_stream()); } void TearDown() override { - CUDA_CHECK(cudaFree(search_data)); - CUDA_CHECK(cudaFree(search_labels)); - CUDA_CHECK(cudaFree(output_dists)); - CUDA_CHECK(cudaFree(output_indices)); - CUDA_CHECK(cudaFree(actual_labels)); - CUDA_CHECK(cudaFree(expected_labels)); + CUDA_CHECK(cudaFree(search_data_)); + CUDA_CHECK(cudaFree(indices_)); + CUDA_CHECK(cudaFree(distances_)); + CUDA_CHECK(cudaFree(actual_labels_)); } private: @@ -79,12 +149,15 @@ class KNNTest : public ::testing::TestWithParam { KNNInputs params_; int rows_; int cols_; - std::vector input_; - std::vector sizes_; + float *input_; float *search_data_; - int64_t indices_; + int64_t *indices_; float* distances_; int k_; + + int *search_labels_; + int *actual_labels_; + int *expected_labels_; }; @@ -92,27 +165,20 @@ const std::vector inputs = { // 2D { { - { 7.89611 , -6.3093657 }, - { 8.198494 , -6.6102095 }, - {-1.067701 , 0.2757877 }, - { 5.5629272, -4.0279684 }, - { 8.466168 , 
-6.3818727 }, - { 7.373038 , -3.2476108 }, - { 7.3618903, -6.311329 }, - { 3.5585778, 2.3175476 }, - { 8.722544 , -6.184722 }, - { 5.9165254, -4.0085735 }, - {-2.4502695, 1.8806121 }, - { 1.250205 , 1.6940732 }, - { 7.702861 , -5.5382366 }, - {-0.32521492, 1.0503006 }, - { 7.203165 , -6.1078873 }, - { 0.7067232, -0.02844107}, - {-0.6195269, 1.6659582 }, - { 7.3585844, -6.5425425 }, - { 0.2946735, 0.7920021 }, - { 5.9978905, -4.235259 }}, - 2}, + { 2.7810836,2.550537003 }, + { 1.465489372,2.362125076}, + { 3.396561688,4.400293529}, + { 1.38807019,1.850220317 }, + { 3.06407232,3.005305973 }, + { 7.627531214,2.759262235 }, + { 5.332441248,2.088626775 }, + { 6.922596716,1.77106367 }, + { 8.675418651,-0.242068655}, + {7.673756466,3.508563011 }, + }, + 2, + {0, 0, 0, 0, 0, 1, 1, 1, 1, 1} + } }; typedef KNNTest KNNTestF; @@ -120,4 +186,5 @@ TEST_P(KNNTestF, BruteForce) { this->testBruteForce(); } INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestF, ::testing::ValuesIn(inputs)); +} // namespace knn } // namespace raft From 418417fdc3fc6a70d7400fb1ba6c9b237fc492d5 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 13 Jan 2021 17:28:57 -0600 Subject: [PATCH 08/19] Find faiss in conda env --- cpp/CMakeLists.txt | 28 ++++---- cpp/cmake/Dependencies.cmake | 7 +- cpp/cmake/templates/CMakeLists.txt | 51 --------------- cpp/cmake/templates/Findpkg.cmake.in | 98 ---------------------------- 4 files changed, 18 insertions(+), 166 deletions(-) delete mode 100644 cpp/cmake/templates/CMakeLists.txt delete mode 100644 cpp/cmake/templates/Findpkg.cmake.in diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 82d7c3d2f0..6e4530a707 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -14,7 +14,6 @@ # limitations under the License. #============================================================================= - cmake_minimum_required(VERSION 3.14 FATAL_ERROR) project(RAFT VERSION 0.18.0 LANGUAGES CXX CUDA) @@ -67,18 +66,6 @@ set(GPU_ARCHS "" CACHE STRING ############################################################################## # - Requirements ------------------------------------------------------------- -# Create FindPackage.cmake files to use find(package) functionality for -# dependencies - -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates) - -if(NOT BUILD_STATIC_FAISS) - GENERATE_FIND_MODULE( - NAME FAISS - HEADER_NAME faiss/IndexFlat.h - LIBRARY_NAME faiss) -endif(NOT BUILD_STATIC_FAISS) - set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) find_package(CUDA 10.0 REQUIRED) @@ -189,6 +176,18 @@ endif() include(cmake/Dependencies.cmake) include(cmake/comms.cmake) +################################################################################################### +# - FAISS ------------------------------------------------------------------------------------------- + +if(NOT BUILD_STATIC_FAISS) + find_path(FAISS_INCLUDE_DIRS "faiss" + HINTS + "$ENV{FAISS_ROOT}/include" + "$ENV{CONDA_PREFIX}/include/faiss" + "$ENV{CONDA_PREFIX}/include") +endif(NOT BUILD_STATIC_FAISS) +message(STATUS "FAISS: FAISS_INCLUDE_DIRS set to ${FAISS_INCLUDE_DIRS}") + ################################################################################################### # - RMM ------------------------------------------------------------------------------------------- @@ -226,7 +225,6 @@ endif(DEFINED ENV{CONDA_PREFIX}) # - libraries ---------------------------------------------------------------- set(RAFT_LINK_LIBRARIES - FAISS::FAISS ${CUDA_cublas_LIBRARY} ${CUDA_cusolver_LIBRARY} ${CUDA_CUDART_LIBRARY} @@ 
-234,6 +232,7 @@ set(RAFT_LINK_LIBRARIES ${CUDA_curand_LIBRARY}) set(RAFT_LINK_DIRECTORIES + ${FAISS_INCLUDE_DIRS} ${RMM_INCLUDE_DIRS}) if(DEFINED ENV{CONDA_PREFIX}) @@ -300,6 +299,7 @@ if(BUILD_RAFT_TESTS) target_link_libraries(test_raft PRIVATE ${RAFT_LINK_LIBRARIES} + FAISS::FAISS GTest::GTest GTest::Main OpenMP::OpenMP_CXX diff --git a/cpp/cmake/Dependencies.cmake b/cpp/cmake/Dependencies.cmake index c5dd705003..080efb5b1f 100644 --- a/cpp/cmake/Dependencies.cmake +++ b/cpp/cmake/Dependencies.cmake @@ -66,8 +66,10 @@ if(BUILD_STATIC_FAISS) # set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/src/faiss") set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/src") else() - set(FAISS_INSTALL_DIR ENV{FAISS_ROOT}) - find_package(FAISS REQUIRED) + add_library(FAISS::FAISS SHARED IMPORTED) + set_property(TARGET FAISS::FAISS PROPERTY + IMPORTED_LOCATION $ENV{CONDA_PREFIX}/lib/libfaiss.so) + message(STATUS "Found FAISS: $ENV{CONDA_PREFIX}/lib/libfaiss.so") endif(BUILD_STATIC_FAISS) ############################################################################## @@ -106,5 +108,4 @@ endif(BUILD_GTEST) if(NOT CUB_IS_PART_OF_CTK) add_dependencies(GTest::GTest cub) endif(NOT CUB_IS_PART_OF_CTK) -add_dependencies(FAISS::FAISS benchmark) add_dependencies(FAISS::FAISS faiss) diff --git a/cpp/cmake/templates/CMakeLists.txt b/cpp/cmake/templates/CMakeLists.txt deleted file mode 100644 index 5bf20c9061..0000000000 --- a/cpp/cmake/templates/CMakeLists.txt +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -# File to generate Find*.cmake modules to find packages. -# This file uses Findpkd.cmake.in to generate modules so that we can use -# find(package) in the main CMake for any package we want. -# -# To use, include in CMakeLists.txt this folder, and then use the function -# GENERATE_FIND_MODULE, which takes the following parameters: -# NAME: -# Name of the package to find in find(NAME) -# HEADER_NAME: -# Name of header file to use to find include dirs path of the package. -# LIBRARY_NAME: -# (Optional) Name of library to find to find include lib path of the package. -# Assumed to be lib${NAME} if left empty. -# LOCATION: -# (Optional) Name of additional folder to look for headers/lib files. -# Useful if one wants to reduce size of #include commands -# VERSION: -# (Optional) Version of the package. 
Useful to find libraries that append -# version number to their filename (for example libopenblas.so.0) - - -function(GENERATE_FIND_MODULE) - set(oneValueArgs NAME LOCATION VERSION) - set(multiValueArgs HEADER_NAME LIBRARY_NAME) - cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN} ) - - if(NOT DEFINED PKG_LIBRARY_NAME) - set(PKG_LIBRARY_NAME ${PKG_NAME}) - endif(NOT DEFINED PKG_LIBRARY_NAME) - - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/Findpkg.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/cmake/Find${PKG_NAME}.cmake - @ONLY) -endfunction() - diff --git a/cpp/cmake/templates/Findpkg.cmake.in b/cpp/cmake/templates/Findpkg.cmake.in deleted file mode 100644 index a81f167d62..0000000000 --- a/cpp/cmake/templates/Findpkg.cmake.in +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Based on FindPNG.cmake from cmake 3.14.3 - -#[=======================================================================[.rst: -Find@PKG_NAME@ --------- - -Template to generate FindPKG_NAME.cmake CMake modules - -Find @PKG_NAME@ - -Imported targets -^^^^^^^^^^^^^^^^ - -This module defines the following :prop_tgt:`IMPORTED` target: - -``@PKG_NAME@::@PKG_NAME@`` - The lib@PKG_NAME@ library, if found. - -Result variables -^^^^^^^^^^^^^^^^ - -This module will set the following variables in your project: - -``@PKG_NAME@_INCLUDE_DIRS`` - where to find @PKG_NAME@.hpp , etc. -``@PKG_NAME@_LIBRARIES`` - the libraries to link against to use lib@PKG_NAME@. -``@PKG_NAME@_FOUND`` - If false, do not try to use @PKG_NAME@. -``@PKG_NAME@_VERSION_STRING`` - the version of the @PKG_NAME@ library found - -#]=======================================================================] - -find_path(@PKG_NAME@_LOCATION @PKG_HEADER_NAME@ - HINTS ${@PKG_NAME@_INSTALL_DIR} - PATH_SUFFIXES include include/@PKG_LOCATION@) - -list(APPEND @PKG_NAME@_NAMES @PKG_LIBRARY_NAME@ lib@PKG_LIBRARY_NAME@) -set(_@PKG_NAME@_VERSION_SUFFIXES @PKG_VERSION@) - -foreach(v IN LISTS _@PKG_NAME@_VERSION_SUFFIXES) - list(APPEND @PKG_NAME@_NAMES @PKG_LIBRARY_NAME@${v} lib@PKG_LIBRARY_NAME@${v}) - list(APPEND @PKG_NAME@_NAMES @PKG_LIBRARY_NAME@.${v} lib@PKG_LIBRARY_NAME@.${v}) -endforeach() -unset(_@PKG_NAME@_VERSION_SUFFIXES) - -find_library(@PKG_NAME@_LIBRARY_RELEASE NAMES ${@PKG_NAME@_NAMES} - HINTS ${@PKG_NAME@_INSTALL_DIR} - PATH_SUFFIXES lib) - -include(${CMAKE_ROOT}/Modules/SelectLibraryConfigurations.cmake) -select_library_configurations(@PKG_NAME@) -mark_as_advanced(@PKG_NAME@_LIBRARY_RELEASE) -unset(@PKG_NAME@_NAMES) - -# Set by select_library_configurations(), but we want the one from -# find_package_handle_standard_args() below. 
-unset(@PKG_NAME@_FOUND) - -if (@PKG_NAME@_LIBRARY AND @PKG_NAME@_LOCATION) - set(@PKG_NAME@_INCLUDE_DIRS ${@PKG_NAME@_LOCATION} ) - set(@PKG_NAME@_LIBRARY ${@PKG_NAME@_LIBRARY}) - - if(NOT TARGET @PKG_NAME@::@PKG_NAME@) - add_library(@PKG_NAME@::@PKG_NAME@ UNKNOWN IMPORTED) - set_target_properties(@PKG_NAME@::@PKG_NAME@ PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${@PKG_NAME@_INCLUDE_DIRS}") - if(EXISTS "${@PKG_NAME@_LIBRARY}") - set_target_properties(@PKG_NAME@::@PKG_NAME@ PROPERTIES - IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" - IMPORTED_LOCATION "${@PKG_NAME@_LIBRARY}") - endif() - endif() -endif () - - -include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) -find_package_handle_standard_args(@PKG_NAME@ - REQUIRED_VARS @PKG_NAME@_LIBRARY @PKG_NAME@_LOCATION - VERSION_VAR @PKG_NAME@_VERSION_STRING) - -mark_as_advanced(@PKG_NAME@_LOCATION @PKG_NAME@_LIBRARY) From b47311168a5affac138401a2f0c6d9a683b4f2d1 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 13 Jan 2021 20:17:27 -0600 Subject: [PATCH 09/19] Remove warning in loop --- cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index 432c3b2cf5..0e7206edca 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -211,7 +211,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, // from the local partitions id_ranges = new std::vector(); int64_t total_n = 0; - for (int i = 0; i < input.size(); i++) { + for (size_t i = 0; i < input.size(); i++) { id_ranges->push_back(total_n); total_n += sizes[i]; } @@ -228,7 +228,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, std::vector>> metric_processors( input.size()); - for (int i = 0; i < input.size(); i++) { + for (size_t i = 0; i < input.size(); i++) { metric_processors[i] = create_processor( metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); metric_processors[i]->preprocess(input[i]); @@ -258,7 +258,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, // Sync user stream only if using other streams to parallelize query if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); - for (int i = 0; i < input.size(); i++) { + for (size_t i = 0; i < input.size(); i++) { faiss::gpu::StandardGpuResources gpu_res; cudaStream_t stream = @@ -322,7 +322,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, query_metric_processor->revert(search_items); query_metric_processor->postprocess(out_D); - for (int i = 0; i < input.size(); i++) { + for (size_t i = 0; i < input.size(); i++) { metric_processors[i]->revert(input[i]); } From cfbe32a4759b58f63c9ffcc6640d6ae23b5c6255 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Fri, 15 Jan 2021 19:19:07 -0600 Subject: [PATCH 10/19] Update to raft distance type --- .../spatial/knn/detail/brute_force_knn.hpp | 26 ++++++++----- .../raft/spatial/knn/detail/processing.hpp | 9 ++--- cpp/include/raft/spatial/knn/detail/utils.hpp | 39 ------------------- cpp/include/raft/spatial/knn/knn.hpp | 7 ++-- cpp/test/spatial/knn.cu | 7 +--- 5 files changed, 26 insertions(+), 62 deletions(-) delete mode 100644 cpp/include/raft/spatial/knn/detail/utils.hpp diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index 
0e7206edca..b94bab57af 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -1,8 +1,5 @@ #pragma once -#include "processing.hpp" -#include "utils.hpp" - #include #include @@ -16,9 +13,12 @@ #include #include #include +#include #include #include +#include "processing.hpp" + namespace raft { namespace knn { namespace detail { @@ -149,14 +149,20 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); } -inline faiss::MetricType build_faiss_metric(MetricType metric) { +inline faiss::MetricType build_faiss_metric(distance::DistanceType metric) { switch (metric) { - case MetricType::METRIC_Cosine: - return faiss::MetricType::METRIC_INNER_PRODUCT; - case MetricType::METRIC_Correlation: - return faiss::MetricType::METRIC_INNER_PRODUCT; + case distance::DistanceType::EucUnexpandedL2: + return faiss::MetricType::METRIC_L2; + case distance::DistanceType::EucUnexpandedL1: + return faiss::MetricType::METRIC_L1; + case distance::DistanceType::ChebyChev: + return faiss::MetricType::METRIC_Linf; + case distance::DistanceType::Minkowski: + return faiss::MetricType::METRIC_Lp; + case distance::DistanceType::Canberra: + return faiss::MetricType::METRIC_Canberra; default: - return (faiss::MetricType)metric; + return faiss::MetricType::METRIC_INNER_PRODUCT; } } @@ -196,7 +202,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, int n_int_streams = 0, bool rowMajorIndex = true, bool rowMajorQuery = true, std::vector *translations = nullptr, - MetricType metric = MetricType::METRIC_L2, + distance::DistanceType metric = distance::DistanceType::EucUnexpandedL2, float metricArg = 2.0, bool expanded_form = false) { ASSERT(input.size() == sizes.size(), diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index 57c7e1050c..ca779f45c3 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -15,8 +15,7 @@ */ #pragma once -#include "utils.hpp" - +#include #include #include #include @@ -160,17 +159,17 @@ class DefaultMetricProcessor : public MetricProcessor { template inline std::unique_ptr> create_processor( - MetricType metric, int n, int D, int k, bool rowMajorQuery, + distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, cudaStream_t userStream, std::shared_ptr allocator) { MetricProcessor *mp = nullptr; switch (metric) { - case MetricType::METRIC_Cosine: + case distance::DistanceType::EucExpandedCosine: mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); break; - case MetricType::METRIC_Correlation: + case distance::DistanceType::Correlation: mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); break; diff --git a/cpp/include/raft/spatial/knn/detail/utils.hpp b/cpp/include/raft/spatial/knn/detail/utils.hpp deleted file mode 100644 index c8542772f7..0000000000 --- a/cpp/include/raft/spatial/knn/detail/utils.hpp +++ /dev/null @@ -1,39 +0,0 @@ - -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace raft { -namespace knn { - - enum MetricType { - METRIC_INNER_PRODUCT = 0, - METRIC_L2, - METRIC_L1, - METRIC_Linf, - METRIC_Lp, - - METRIC_Canberra = 20, - METRIC_BrayCurtis, - METRIC_JensenShannon, - - METRIC_Cosine = 100, - METRIC_Correlation - }; - -} // namespace knn -} // namespace raft diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index b6e3ed09ab..2a438c2f69 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -51,9 +51,10 @@ using deviceAllocator = raft::mr::device::allocator; */ void brute_force_knn(raft::handle_t &handle, std::vector &input, std::vector &sizes, int D, float *search_items, int n, - int64_t *res_I, float *res_D, int k, bool rowMajorIndex, - bool rowMajorQuery, MetricType metric, float metric_arg, - bool expanded) { + int64_t *res_I, float *res_D, int k, bool rowMajorIndex = false, + bool rowMajorQuery = false, + distance::DistanceType metric = distance::DistanceType::EucUnexpandedL2, + float metric_arg = 2.0f, bool expanded = false) { ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size"); diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index bfacc4935e..680d641f27 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -19,6 +19,7 @@ #include #include #include +#include namespace raft { namespace knn { @@ -81,10 +82,7 @@ class KNNTest : public ::testing::TestWithParam { distances_, k_, true, - true, - MetricType::METRIC_L2, - 0.0, - false); + true); build_actual_output<<>>( actual_labels_, rows_, @@ -127,7 +125,6 @@ class KNNTest : public ::testing::TestWithParam { raft::allocate(input_, rows_ * cols_, true); raft::allocate(search_data_, rows_ * cols_, true); - raft::allocate(search_data_, rows_ * cols_, true); raft::allocate(indices_, rows_ * k_, true); raft::allocate(distances_, rows_ * k_, true); raft::allocate(search_labels_, rows_, true); From e480951f69ca2bb28fd600d52e6f8aa6551edb58 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Tue, 26 Jan 2021 15:19:12 -0600 Subject: [PATCH 11/19] Update distances and add assert in test --- .../raft/spatial/knn/detail/brute_force_knn.hpp | 14 +++++++++----- cpp/include/raft/spatial/knn/detail/processing.hpp | 4 ++-- cpp/include/raft/spatial/knn/knn.hpp | 2 +- cpp/test/spatial/knn.cu | 11 ++++++++--- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index b94bab57af..ce86b2ce77 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -151,16 +151,20 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, inline faiss::MetricType build_faiss_metric(distance::DistanceType metric) { switch (metric) { - case distance::DistanceType::EucUnexpandedL2: + case distance::DistanceType::L2Unexpanded: return faiss::MetricType::METRIC_L2; - case distance::DistanceType::EucUnexpandedL1: + case 
distance::DistanceType::L1: return faiss::MetricType::METRIC_L1; - case distance::DistanceType::ChebyChev: + case distance::DistanceType::Linf: return faiss::MetricType::METRIC_Linf; - case distance::DistanceType::Minkowski: + case distance::DistanceType::LpUnexpanded: return faiss::MetricType::METRIC_Lp; case distance::DistanceType::Canberra: return faiss::MetricType::METRIC_Canberra; + case distance::DistanceType::BrayCurtis: + return faiss::MetricType::METRIC_BrayCurtis; + case distance::DistanceType::JensenShannon: + return faiss::MetricType::METRIC_JensenShannon; default: return faiss::MetricType::METRIC_INNER_PRODUCT; } @@ -202,7 +206,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, int n_int_streams = 0, bool rowMajorIndex = true, bool rowMajorQuery = true, std::vector *translations = nullptr, - distance::DistanceType metric = distance::DistanceType::EucUnexpandedL2, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded, float metricArg = 2.0, bool expanded_form = false) { ASSERT(input.size() == sizes.size(), diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index ca779f45c3..60f6b13800 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -164,12 +164,12 @@ inline std::unique_ptr> create_processor( MetricProcessor *mp = nullptr; switch (metric) { - case distance::DistanceType::EucExpandedCosine: + case distance::DistanceType::CosineExpanded: mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); break; - case distance::DistanceType::Correlation: + case distance::DistanceType::CorrelationExpanded: mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); break; diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index 2a438c2f69..556ddba86e 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -53,7 +53,7 @@ void brute_force_knn(raft::handle_t &handle, std::vector &input, std::vector &sizes, int D, float *search_items, int n, int64_t *res_I, float *res_D, int k, bool rowMajorIndex = false, bool rowMajorQuery = false, - distance::DistanceType metric = distance::DistanceType::EucUnexpandedL2, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded, float metric_arg = 2.0f, bool expanded = false) { ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size"); diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 680d641f27..27ff4a19d9 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -16,10 +16,11 @@ #include #include -#include -#include -#include #include +#include +#include +#include +#include "../test_utils.h" namespace raft { namespace knn { @@ -101,6 +102,10 @@ class KNNTest : public ::testing::TestWithParam { rows_ * k_, std::cout); raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_, std::cout); + + ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, + rows_ * k_, + raft::Compare())); } void SetUp() override { From 0ba85770c76bde649a16dfbb47d45f5429f3a2c7 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 27 Jan 2021 12:43:27 -0600 Subject: [PATCH 12/19] Clean pr --- cpp/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6e4530a707..5db1504e57 100644 --- a/cpp/CMakeLists.txt +++ 
b/cpp/CMakeLists.txt @@ -66,9 +66,10 @@ set(GPU_ARCHS "" CACHE STRING ############################################################################## # - Requirements ------------------------------------------------------------- +find_package(CUDA 10.0 REQUIRED) + set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) -find_package(CUDA 10.0 REQUIRED) ############################################################################## # - Compiler Options -------------------------------------------------------- From 63a634c890f7bd67ecf2763ce0f5a0293b0e60e0 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 27 Jan 2021 12:44:13 -0600 Subject: [PATCH 13/19] Clean pr --- cpp/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5db1504e57..283ae2f83a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -70,7 +70,6 @@ find_package(CUDA 10.0 REQUIRED) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) - ############################################################################## # - Compiler Options -------------------------------------------------------- From f3c5f2b43cae43925040cf79906162d0f0e31ec7 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 27 Jan 2021 12:45:00 -0600 Subject: [PATCH 14/19] Clean cmake --- cpp/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 283ae2f83a..fe9f927348 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -14,6 +14,7 @@ # limitations under the License. #============================================================================= + cmake_minimum_required(VERSION 3.14 FATAL_ERROR) project(RAFT VERSION 0.18.0 LANGUAGES CXX CUDA) From b7c90960ef90d65fb91793ca07f4f555a8c93e25 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 27 Jan 2021 12:47:13 -0600 Subject: [PATCH 15/19] Uncomment tests --- cpp/CMakeLists.txt | 78 +++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fe9f927348..2c39b30881 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -248,45 +248,45 @@ if(BUILD_RAFT_TESTS) # keep the files in alphabetical order! 
add_executable(test_raft - # test/cudart_utils.cpp - #test/handle.cpp - #test/integer_utils.cpp - #test/lap/lap.cu - #test/linalg/add.cu - #test/linalg/binary_op.cu - #test/linalg/cholesky_r1.cu - #test/linalg/coalesced_reduction.cu - #test/linalg/divide.cu - #test/linalg/eig.cu - #test/linalg/eig_sel.cu - #test/linalg/gemm_layout.cu - #test/linalg/map_then_reduce.cu - #test/linalg/matrix_vector_op.cu - #test/linalg/multiply.cu - #test/linalg/norm.cu - #test/linalg/reduce.cu - #test/linalg/strided_reduction.cu - #test/linalg/subtract.cu - #test/linalg/svd.cu - #test/linalg/transpose.cu - #test/linalg/unary_op.cu - #test/matrix/math.cu - #test/matrix/matrix.cu - #test/mr/device/buffer.cpp - #test/mr/host/buffer.cpp - #test/random/rng.cu - #test/random/rng_int.cu - #test/random/sample_without_replacement.cu - test/spatial/knn.cu) - #test/stats/mean.cu - #test/stats/mean_center.cu - #test/stats/stddev.cu - #test/stats/sum.cu - #test/test.cpp - #test/spectral_matrix.cu - #test/eigen_solvers.cu - #test/cluster_solvers.cu - #test/mst.cu) + test/cudart_utils.cpp + test/handle.cpp + test/integer_utils.cpp + test/lap/lap.cu + test/linalg/add.cu + test/linalg/binary_op.cu + test/linalg/cholesky_r1.cu + test/linalg/coalesced_reduction.cu + test/linalg/divide.cu + test/linalg/eig.cu + test/linalg/eig_sel.cu + test/linalg/gemm_layout.cu + test/linalg/map_then_reduce.cu + test/linalg/matrix_vector_op.cu + test/linalg/multiply.cu + test/linalg/norm.cu + test/linalg/reduce.cu + test/linalg/strided_reduction.cu + test/linalg/subtract.cu + test/linalg/svd.cu + test/linalg/transpose.cu + test/linalg/unary_op.cu + test/matrix/math.cu + test/matrix/matrix.cu + test/mr/device/buffer.cpp + test/mr/host/buffer.cpp + test/random/rng.cu + test/random/rng_int.cu + test/random/sample_without_replacement.cu + test/spatial/knn.cu + test/stats/mean.cu + test/stats/mean_center.cu + test/stats/stddev.cu + test/stats/sum.cu + test/test.cpp + test/spectral_matrix.cu + test/eigen_solvers.cu + test/cluster_solvers.cu + test/mst.cu) target_include_directories(test_raft PRIVATE From c247f250bd85e8127d9c1d95f7052a6dc5b2cf62 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Wed, 27 Jan 2021 17:52:09 -0600 Subject: [PATCH 16/19] Clang format --- .../spatial/knn/detail/brute_force_knn.hpp | 40 +++---- .../raft/spatial/knn/detail/processing.hpp | 4 +- cpp/include/raft/spatial/knn/knn.hpp | 18 +-- cpp/test/spatial/knn.cu | 113 ++++++++---------- 4 files changed, 77 insertions(+), 98 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index ce86b2ce77..caf79265c6 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -10,11 +10,11 @@ #include #include +#include #include #include -#include -#include #include +#include #include #include "processing.hpp" @@ -197,19 +197,16 @@ inline faiss::MetricType build_faiss_metric(distance::DistanceType metric) { * @param[in] expanded_form whether or not lp variants should be reduced w/ lp-root */ template -void brute_force_knn_impl(std::vector &input, std::vector &sizes, - IntType D, float *search_items, IntType n, int64_t *res_I, - float *res_D, IntType k, - std::shared_ptr allocator, - cudaStream_t userStream, - cudaStream_t *internalStreams = nullptr, - int n_int_streams = 0, bool rowMajorIndex = true, - bool rowMajorQuery = true, - std::vector *translations = nullptr, - distance::DistanceType metric = 
distance::DistanceType::L2Unexpanded, - float metricArg = 2.0, bool expanded_form = false) { - - ASSERT(input.size() == sizes.size(), +void brute_force_knn_impl( + std::vector &input, std::vector &sizes, IntType D, + float *search_items, IntType n, int64_t *res_I, float *res_D, IntType k, + std::shared_ptr allocator, + cudaStream_t userStream, cudaStream_t *internalStreams = nullptr, + int n_int_streams = 0, bool rowMajorIndex = true, bool rowMajorQuery = true, + std::vector *translations = nullptr, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded, + float metricArg = 2.0, bool expanded_form = false) { + ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size"); faiss::MetricType m = detail::build_faiss_metric(metric); @@ -247,7 +244,8 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, int device; CUDA_CHECK(cudaGetDevice(&device)); - raft::mr::device::buffer trans(allocator, userStream, id_ranges->size()); + raft::mr::device::buffer trans(allocator, userStream, + id_ranges->size()); raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream); @@ -312,8 +310,8 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, if (input.size() > 1 || translations != nullptr) { // This is necessary for proper index translations. If there are // no translations or partitions to combine, it can be skipped. - detail::knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, - trans.data()); + detail::knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, + userStream, trans.data()); } // Perform necessary post-processing @@ -339,6 +337,6 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, if (translations == nullptr) delete id_ranges; } -} // namespace detail -} // namespace knn -} // namespace raft +} // namespace detail +} // namespace knn +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index 60f6b13800..1b6589100f 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -19,10 +19,10 @@ #include #include #include -#include -#include #include #include +#include +#include namespace raft { namespace knn { diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index 556ddba86e..668f5291e1 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -22,7 +22,7 @@ #include namespace raft { - namespace knn { +namespace knn { using deviceAllocator = raft::mr::device::allocator; @@ -49,12 +49,12 @@ using deviceAllocator = raft::mr::device::allocator; * @param[in] expanded should lp-based distances be returned in their expanded * form (e.g., without raising to the 1/p power). 
*/ -void brute_force_knn(raft::handle_t &handle, std::vector &input, - std::vector &sizes, int D, float *search_items, int n, - int64_t *res_I, float *res_D, int k, bool rowMajorIndex = false, - bool rowMajorQuery = false, - distance::DistanceType metric = distance::DistanceType::L2Unexpanded, - float metric_arg = 2.0f, bool expanded = false) { +void brute_force_knn( + raft::handle_t &handle, std::vector &input, std::vector &sizes, + int D, float *search_items, int n, int64_t *res_I, float *res_D, int k, + bool rowMajorIndex = false, bool rowMajorQuery = false, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded, + float metric_arg = 2.0f, bool expanded = false) { ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size"); @@ -67,5 +67,5 @@ void brute_force_knn(raft::handle_t &handle, std::vector &input, metric, metric_arg, expanded); } -} // namespace knn -} // namespace raft +} // namespace knn +} // namespace raft diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 27ff4a19d9..37bb859a53 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include #include #include @@ -24,11 +24,11 @@ namespace raft { namespace knn { - struct KNNInputs { - std::vector> input; - int k; - std::vector labels; - }; +struct KNNInputs { + std::vector> input; + int k; + std::vector labels; +}; __global__ void build_actual_output(int *output, int n_rows, int k, const int *idx_labels, @@ -55,12 +55,11 @@ template class KNNTest : public ::testing::TestWithParam { protected: void testBruteForce() { - - raft::print_device_vector("Input array: ", input_, - rows_ * cols_, std::cout); + raft::print_device_vector("Input array: ", input_, rows_ * cols_, + std::cout); std::cout << "K: " << k_ << "\n"; - raft::print_device_vector("Labels array: ", search_labels_, - rows_, std::cout); + raft::print_device_vector("Labels array: ", search_labels_, rows_, + std::cout); auto stream = handle_.get_stream(); @@ -72,40 +71,26 @@ class KNNTest : public ::testing::TestWithParam { input_vec.push_back(input_); sizes_vec.push_back(rows_); - - brute_force_knn(handle_, - input_vec, - sizes_vec, - cols_, - search_data_, - rows_, - indices_, - distances_, - k_, - true, - true); + brute_force_knn(handle_, input_vec, sizes_vec, cols_, search_data_, rows_, + indices_, distances_, k_, true, true); build_actual_output<<>>( - actual_labels_, rows_, - k_, search_labels_, - indices_); + actual_labels_, rows_, k_, search_labels_, indices_); build_expected_output<<>>( - expected_labels_, rows_, - k_, search_labels_); - - raft::print_device_vector("Output indices: ", indices_, - rows_ * k_, std::cout); - raft::print_device_vector("Output distances: ", distances_, - rows_ * k_, std::cout); - raft::print_device_vector("Output labels: ", actual_labels_, - rows_ * k_, std::cout); - raft::print_device_vector("Expected labels: ", expected_labels_, - rows_ * k_, std::cout); - - ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, - rows_ * k_, - raft::Compare())); + expected_labels_, rows_, k_, search_labels_); + + raft::print_device_vector("Output indices: ", indices_, rows_ * k_, + std::cout); + raft::print_device_vector("Output distances: ", distances_, rows_ * k_, + std::cout); + raft::print_device_vector("Output labels: ", actual_labels_, rows_ * k_, + std::cout); + raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_, + std::cout); + + ASSERT_TRUE(devArrMatch(expected_labels_, 
actual_labels_, rows_ * k_, + raft::Compare())); } void SetUp() override { @@ -120,12 +105,12 @@ class KNNTest : public ::testing::TestWithParam { row_major_input.push_back(params_.input[i][j]); } } - rmm::device_buffer input_d = rmm::device_buffer(row_major_input.data(), - row_major_input.size() * sizeof(float)); + rmm::device_buffer input_d = rmm::device_buffer( + row_major_input.data(), row_major_input.size() * sizeof(float)); float *input_ptr = static_cast(input_d.data()); - rmm::device_buffer labels_d = rmm::device_buffer(params_.labels.data(), - params_.labels.size() * sizeof(int)); + rmm::device_buffer labels_d = rmm::device_buffer( + params_.labels.data(), params_.labels.size() * sizeof(int)); int *labels_ptr = static_cast(labels_d.data()); raft::allocate(input_, rows_ * cols_, true); @@ -154,7 +139,7 @@ class KNNTest : public ::testing::TestWithParam { float *input_; float *search_data_; int64_t *indices_; - float* distances_; + float *distances_; int k_; int *search_labels_; @@ -162,31 +147,27 @@ class KNNTest : public ::testing::TestWithParam { int *expected_labels_; }; - const std::vector inputs = { // 2D - { - { - { 2.7810836,2.550537003 }, - { 1.465489372,2.362125076}, - { 3.396561688,4.400293529}, - { 1.38807019,1.850220317 }, - { 3.06407232,3.005305973 }, - { 7.627531214,2.759262235 }, - { 5.332441248,2.088626775 }, - { 6.922596716,1.77106367 }, - { 8.675418651,-0.242068655}, - {7.673756466,3.508563011 }, - }, - 2, - {0, 0, 0, 0, 0, 1, 1, 1, 1, 1} - } -}; + {{ + {2.7810836, 2.550537003}, + {1.465489372, 2.362125076}, + {3.396561688, 4.400293529}, + {1.38807019, 1.850220317}, + {3.06407232, 3.005305973}, + {7.627531214, 2.759262235}, + {5.332441248, 2.088626775}, + {6.922596716, 1.77106367}, + {8.675418651, -0.242068655}, + {7.673756466, 3.508563011}, + }, + 2, + {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}}}; typedef KNNTest KNNTestF; TEST_P(KNNTestF, BruteForce) { this->testBruteForce(); } INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestF, ::testing::ValuesIn(inputs)); -} // namespace knn -} // namespace raft +} // namespace knn +} // namespace raft From fe5a378c93b3e1a719b10ec9f72d6a5528e46a07 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Mon, 1 Feb 2021 11:50:44 -0600 Subject: [PATCH 17/19] Update license --- .../raft/spatial/knn/detail/brute_force_knn.hpp | 16 ++++++++++++++++ .../raft/spatial/knn/detail/processing.hpp | 2 +- cpp/include/raft/spatial/knn/knn.hpp | 2 +- cpp/test/spatial/knn.cu | 2 +- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index caf79265c6..00316753d8 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + #pragma once #include diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index 1b6589100f..9e86ac0550 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c)2020, NVIDIA CORPORATION. + * Copyright (c)2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index 668f5291e1..2bba8bfb6c 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 37bb859a53..683bcae7d9 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 9315752555050877f934522a08709b5dc2daff8a Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Mon, 1 Feb 2021 11:57:42 -0600 Subject: [PATCH 18/19] Add spatial namespace --- cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp | 2 ++ cpp/include/raft/spatial/knn/detail/processing.hpp | 6 ++++-- cpp/include/raft/spatial/knn/knn.hpp | 2 ++ cpp/test/spatial/knn.cu | 2 ++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp index 00316753d8..1ca5be2052 100644 --- a/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp +++ b/cpp/include/raft/spatial/knn/detail/brute_force_knn.hpp @@ -36,6 +36,7 @@ #include "processing.hpp" namespace raft { +namespace spatial { namespace knn { namespace detail { @@ -355,4 +356,5 @@ void brute_force_knn_impl( } // namespace detail } // namespace knn +} // namespace spatial } // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index 9e86ac0550..a645412c2f 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -25,6 +25,7 @@ #include namespace raft { +namespace spatial { namespace knn { using deviceAllocator = raft::mr::device::allocator; @@ -186,5 +187,6 @@ template class CosineMetricProcessor; template class CorrelationMetricProcessor; template class DefaultMetricProcessor; -}; // namespace knn -}; // namespace raft +} // namespace knn +} // namespace spatial +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index 2bba8bfb6c..ccee635701 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -22,6 +22,7 @@ #include namespace raft { +namespace spatial { namespace knn { using deviceAllocator = raft::mr::device::allocator; @@ -68,4 +69,5 @@ void brute_force_knn( } } // namespace knn +} // namespace spatial } // namespace raft diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 
683bcae7d9..cfd4ecc9d1 100644
--- a/cpp/test/spatial/knn.cu
+++ b/cpp/test/spatial/knn.cu
@@ -23,6 +23,7 @@
 #include "../test_utils.h"
 
 namespace raft {
+namespace spatial {
 namespace knn {
 struct KNNInputs {
   std::vector<std::vector<float>> input;
@@ -170,4 +171,5 @@ TEST_P(KNNTestF, BruteForce) { this->testBruteForce(); }
 INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestF, ::testing::ValuesIn(inputs));
 
 } // namespace knn
+} // namespace spatial
 } // namespace raft

From 03c5a3574d44b8552fead8a9e9cb0bd3c9d5adf3 Mon Sep 17 00:00:00 2001
From: AJ Schmidt
Date: Mon, 1 Feb 2021 13:25:11 -0500
Subject: [PATCH 19/19] Prepare Changelog for Automation (#3442)

This PR prepares the changelog to be automatically updated during releases.

Authors:
  - AJ Schmidt (@ajschmidt8)

Approvers:
  - Dante Gama Dessavre (@dantegd)

URL: https://github.com/rapidsai/raft/pull/135
---
 CHANGELOG.md | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4747bca4e2..c1fdbbef24 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,12 +1,8 @@
-# RAFT 0.18.0 (Date TBD)
+# 0.18.0
 
-## New Features
-
-## Improvements
-
-## Bug Fixes
+Please see https://github.com/rapidsai/raft/releases/tag/branch-0.18-latest for the latest changes to this development branch.
 
-# RAFT 0.17.0 (Date TBD)
+# RAFT 0.17.0 (10 Dec 2020)
 
 ## New Features
 - PR #65: Adding cuml prims that break circular dependency between cuml and cumlprims projects
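
For reference, a minimal usage sketch of the public interface touched above: cpp/include/raft/spatial/knn/knn.hpp, which [PATCH 18/19] places in the raft::spatial::knn namespace. This is illustrative only, not code from the series: the element types of the input/sizes vectors (float* index partitions and int row counts) are inferred from the test in cpp/test/spatial/knn.cu, and the wrapper function, buffer names, and row counts are assumptions. Per the knn.hpp documentation, expanded = true would return lp-based distances without the final 1/p step (e.g. squared distances for L2), while the defaults below leave that post-processing in place.

// Sketch only (not part of these patches). Assumes a valid raft::handle_t,
// device-resident row-major buffers, and a single index partition; names such
// as query_knn, n_index_rows, and n_query_rows are illustrative.
#include <raft/spatial/knn/knn.hpp>

#include <cstdint>
#include <vector>

void query_knn(raft::handle_t &handle, float *index_data, int n_index_rows,
               float *search_items, int n_query_rows, int dim, int k,
               int64_t *res_I, float *res_D) {
  // One flat index partition; further device pointers and their row counts
  // could be appended to search several partitions in a single call.
  std::vector<float *> input{index_data};
  std::vector<int> sizes{n_index_rows};

  // After [PATCH 18/19] the free function lives in raft::spatial::knn.
  raft::spatial::knn::brute_force_knn(
      handle, input, sizes, dim, search_items, n_query_rows, res_I, res_D, k,
      /*rowMajorIndex=*/true, /*rowMajorQuery=*/true,
      raft::distance::DistanceType::L2Unexpanded,
      /*metric_arg=*/2.0f, /*expanded=*/false);
}

The k neighbors per query row are written to res_I/res_D as n_query_rows-by-k row-major blocks, which matches how the test above reads back rows_ * k_ entries from indices_ and distances_.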