diff --git a/build.sh b/build.sh index 28285a1f1d..82c907ad28 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. # cuml build script @@ -47,7 +47,6 @@ HELP="$0 [<target> ...] [<flag> ...] and profiling enabled (WARNING: Impacts performance) --ccache - Use ccache to cache previous compilations --nocloneraft - CMake will clone RAFT even if it is in the environment, use this flag to disable that behavior - --static-faiss - Force CMake to use the FAISS static libs, cloning and building them if necessary --static-treelite - Force CMake to use the Treelite static libs, cloning and building them if necessary default action (no args) is to build and install 'libcuml', 'cuml', and 'prims' targets only for the detected GPU arch @@ -78,7 +77,6 @@ BUILD_DISABLE_DEPRECATION_WARNINGS=ON BUILD_CUML_STD_COMMS=ON BUILD_CUML_TESTS=ON BUILD_CUML_MG_TESTS=OFF -BUILD_STATIC_FAISS=OFF BUILD_STATIC_TREELITE=OFF CMAKE_LOG_LEVEL=WARNING @@ -199,9 +197,6 @@ while true; do --nocloneraft ) DISABLE_FORCE_CLONE_RAFT=ON ;; - --static-faiss ) - BUILD_STATIC_FAISS=ON - ;; --static-treelite ) BUILD_STATIC_TREELITE=ON ;; @@ -256,7 +251,6 @@ if completeBuild || hasArg libcuml || hasArg prims || hasArg bench || hasArg pri -DBUILD_CUML_TESTS=${BUILD_CUML_TESTS} \ -DBUILD_CUML_MPI_COMMS=${BUILD_CUML_MG_TESTS} \ -DBUILD_CUML_MG_TESTS=${BUILD_CUML_MG_TESTS} \ - -DCUML_USE_FAISS_STATIC=${BUILD_STATIC_FAISS} \ -DCUML_USE_TREELITE_STATIC=${BUILD_STATIC_TREELITE} \ -DNVTX=${NVTX} \ -DUSE_CCACHE=${CCACHE} \ diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 92a1522c9d..0feb1bf80b 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. set -euo pipefail diff --git a/ci/build_python.sh b/ci/build_python.sh index 26ae3d1d1f..2eca4a33ef 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. set -euo pipefail diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py index 407f40f818..0581055647 100644 --- a/ci/checks/copyright.py +++ b/ci/checks/copyright.py @@ -36,7 +36,6 @@ ] FILES_TO_EXCLUDE = [ re.compile(r"cpp/src/tsne/cannylab/bh\.cu"), - re.compile(r"cpp/src/hdbscan/detail/faiss_mr\.hpp"), ] # this will break starting at year 10000, which is probably OK :) diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh index 577532cbd7..1ec5d59380 100644 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION.
set -euo pipefail diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fb3c7350fe..fa3988a33d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -63,19 +63,16 @@ option(NVTX "Enable nvtx markers" OFF) option(SINGLEGPU "Disable all mnmg components and comms libraries" OFF) option(USE_CCACHE "Cache build artifacts with ccache" OFF) option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF) -option(CUML_USE_FAISS_STATIC "Build and statically link the FAISS library for nearest neighbors search on GPU" OFF) option(CUML_USE_RAFT_STATIC "Build and statically link the RAFT libraries" OFF) option(CUML_RAFT_COMPILED "Use libraft shared library" ON) option(CUML_USE_TREELITE_STATIC "Build and statically link the treelite library" OFF) option(CUML_EXPORT_TREELITE_LINKAGE "Whether to publicly or privately link treelite to libcuml++" OFF) option(CUML_USE_CUMLPRIMS_MG_STATIC "Build and statically link the cumlprims_mg library" OFF) -option(CUML_ENABLE_NN_DEPENDENCIES "Whether to enable FAISS dependency" ON) # The options below allow incorporating libcuml into another build process # without installing all its components. This is useful if total file size is # at a premium and we do not expect other consumers to use any APIs of the # dependency except those that are directly linked to by the dependent library. -option(CUML_EXCLUDE_FAISS_FROM_ALL "Exclude FAISS targets from RAFT's 'all' target" ON) option(CUML_EXCLUDE_RAFT_FROM_ALL "Exclude RAFT targets from cuML's 'all' target" OFF) option(CUML_EXCLUDE_TREELITE_FROM_ALL "Exclude Treelite targets from cuML's 'all' target" OFF) option(CUML_EXCLUDE_CUMLPRIMS_MG_FROM_ALL "Exclude cumlprims_mg targets from cuML's 'all' target" OFF) @@ -100,7 +97,6 @@ message(VERBOSE "CUML_CPP: Disabling all mnmg components and comms libraries: ${ message(VERBOSE "CUML_CPP: Cache build artifacts with ccache: ${USE_CCACHE}") message(VERBOSE "CUML_CPP: Statically link the CUDA toolkit runtime and libraries: ${CUDA_STATIC_RUNTIME}") message(VERBOSE "CUML_CPP: Build and statically link RAFT libraries: ${CUML_USE_RAFT_STATIC}") -message(VERBOSE "CUML_CPP: Build and statically link FAISS library: ${CUML_USE_FAISS_STATIC}") message(VERBOSE "CUML_CPP: Build and statically link Treelite library: ${CUML_USE_TREELITE_STATIC}") set(CUML_ALGORITHMS "ALL" CACHE STRING "Experimental: Choose which algorithms are built into libcuml++.so. Can specify individual algorithms or groups in a semicolon-separated list.") @@ -226,10 +222,6 @@ endif() include(cmake/thirdparty/get_raft.cmake) -if(CUML_USE_RAFT_NN) - include(cmake/thirdparty/get_faiss.cmake) -endif() - if(LINK_TREELITE) include(cmake/thirdparty/get_treelite.cmake) endif() @@ -610,7 +602,6 @@ if(BUILD_CUML_CPP_LIBRARY) # These are always private: list(APPEND _cuml_cpp_private_libs raft::raft - $<$:faiss> $ $<$:CUDA::cufft${_ctk_static_suffix}> ${TREELITE_LIBS} diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake deleted file mode 100644 index c4ef84f500..0000000000 --- a/cpp/cmake/thirdparty/get_faiss.cmake +++ /dev/null @@ -1,89 +0,0 @@ -#============================================================================= -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -function(find_and_configure_faiss) - set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL) - cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN} ) - - if(CUML_USE_RAFT_NN) - rapids_find_generate_module(faiss - HEADER_NAMES faiss/IndexFlat.h - LIBRARY_NAMES faiss - ) - - set(BUILD_SHARED_LIBS ON) - if (PKG_BUILD_STATIC_LIBS) - set(BUILD_SHARED_LIBS OFF) - set(CPM_DOWNLOAD_faiss ON) - endif() - - rapids_cpm_find(faiss ${PKG_VERSION} - GLOBAL_TARGETS faiss::faiss - CPM_ARGS - GIT_REPOSITORY ${PKG_REPOSITORY} - GIT_TAG ${PKG_PINNED_TAG} - EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} - OPTIONS - "FAISS_ENABLE_PYTHON OFF" - "CUDAToolkit_ROOT ${CUDAToolkit_LIBRARY_DIR}" - "FAISS_ENABLE_GPU ON" - "BUILD_TESTING OFF" - "CMAKE_MESSAGE_LOG_LEVEL VERBOSE" - "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" - ) - - if(TARGET faiss AND NOT TARGET faiss::faiss) - add_library(faiss::faiss ALIAS faiss) - endif() - - if(faiss_ADDED) - rapids_export(BUILD faiss - EXPORT_SET faiss-targets - GLOBAL_TARGETS faiss - NAMESPACE faiss::) - endif() - endif() - - # We generate the faiss-config files when we built faiss locally, so always do `find_dependency` - rapids_export_package(BUILD OpenMP cuml-exports) # faiss uses openMP but doesn't export a need for it - rapids_export_package(BUILD faiss cuml-exports GLOBAL_TARGETS faiss::faiss faiss) - rapids_export_package(INSTALL faiss cuml-exports GLOBAL_TARGETS faiss::faiss faiss) - - # Tell cmake where it can find the generated faiss-config.cmake we wrote. 
- include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] cuml-exports) -endfunction() - -if(NOT CUML_FAISS_GIT_TAG) - # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC - # (https://github.com/facebookresearch/faiss/pull/2446) - set(CUML_FAISS_GIT_TAG fea/statically-link-ctk-v1.7.0) - # set(RAFT_FAISS_GIT_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30) -endif() - -if(NOT CUML_FAISS_GIT_REPOSITORY) - # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC - # (https://github.com/facebookresearch/faiss/pull/2446) - set(CUML_FAISS_GIT_REPOSITORY https://github.com/trxcllnt/faiss.git) - # set(RAFT_FAISS_GIT_REPOSITORY https://github.com/facebookresearch/faiss.git) -endif() - -find_and_configure_faiss(VERSION 1.7.0 - REPOSITORY ${CUML_FAISS_GIT_REPOSITORY} - PINNED_TAG ${CUML_FAISS_GIT_TAG} - BUILD_STATIC_LIBS ${CUML_USE_FAISS_STATIC} - EXCLUDE_FROM_ALL ${CUML_EXCLUDE_FAISS_FROM_ALL}) \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake index 24b2db76ea..aa15858132 100644 --- a/cpp/cmake/thirdparty/get_raft.cmake +++ b/cpp/cmake/thirdparty/get_raft.cmake @@ -85,8 +85,8 @@ endfunction() # To use a different RAFT locally, set the CMake variable # CPM_raft_SOURCE=/path/to/local/raft find_and_configure_raft(VERSION ${CUML_MIN_VERSION_raft} - FORK rapidsai - PINNED_TAG branch-${CUML_BRANCH_VERSION_raft} + FORK benfred + PINNED_TAG post_distance_op EXCLUDE_FROM_ALL ${CUML_EXCLUDE_RAFT_FROM_ALL} # When PINNED_TAG above doesn't match cuml, # force local raft clone in build directory diff --git a/cpp/src/hdbscan/detail/faiss_mr.hpp b/cpp/src/hdbscan/detail/faiss_mr.hpp deleted file mode 100644 index 6aeaf35b94..0000000000 --- a/cpp/src/hdbscan/detail/faiss_mr.hpp +++ /dev/null @@ -1,642 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* -This code contains unnecessary code duplication. These could be deleted -once the relevant changes would be made on the FAISS side. Indeed most of -the logic in the below code is similar to FAISS's standard implementation -and should thus be inherited instead of duplicated. 
This FAISS's issue -once solved should allow the removal of the unnecessary duplicates -in this file : https://github.com/facebookresearch/faiss/issues/2097 -*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace ML { -namespace HDBSCAN { -namespace detail { -namespace Reachability { - -using namespace faiss::gpu; - -namespace { - -// How many streams per device we allocate by default (for multi-streaming) -constexpr int kNumStreams = 2; - -// Use 256 MiB of pinned memory for async CPU <-> GPU copies by default -constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024; - -// Default temporary memory allocation for <= 4 GiB memory GPUs -constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024; - -// Default temporary memory allocation for <= 8 GiB memory GPUs -constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024; - -// Maximum temporary memory allocation for all GPUs -constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024; - -std::string allocsToString(const std::unordered_map& map) -{ - // Produce a sorted list of all outstanding allocations by type - std::unordered_map> stats; - - for (auto& entry : map) { - auto& a = entry.second; - - auto it = stats.find(a.type); - if (it != stats.end()) { - stats[a.type].first++; - stats[a.type].second += a.size; - } else { - stats[a.type] = std::make_pair(1, a.size); - } - } - - std::stringstream ss; - for (auto& entry : stats) { - ss << "Alloc type " << allocTypeToString(entry.first) << ": " << entry.second.first - << " allocations, " << entry.second.second << " bytes\n"; - } - - return ss.str(); -} - -} // namespace - -/// RMM implementation of the GpuResources object that provides for a -/// temporary memory manager -class RmmGpuResourcesImpl : public GpuResources { - public: - RmmGpuResourcesImpl() - : pinnedMemAlloc_(nullptr), - pinnedMemAllocSize_(0), - // let the adjustment function determine the memory size for us by passing - // in a huge value that will then be adjusted - tempMemSize_(getDefaultTempMemForGPU(-1, std::numeric_limits::max())), - pinnedMemSize_(kDefaultPinnedMemoryAllocation), - allocLogging_(false), - cmr(new rmm::mr::cuda_memory_resource), - mmr(new rmm::mr::managed_memory_resource), - pmr(new rmm::mr::pinned_memory_resource){}; - - ~RmmGpuResourcesImpl() - { - // The temporary memory allocator has allocated memory through us, so clean - // that up before we finish fully de-initializing ourselves - tempMemory_.clear(); - - // Make sure all allocations have been freed - bool allocError = false; - - for (auto& entry : allocs_) { - auto& map = entry.second; - - if (!map.empty()) { - std::cerr << "RmmGpuResources destroyed with allocations outstanding:\n" - << "Device " << entry.first << " outstanding allocations:\n"; - std::cerr << allocsToString(map); - allocError = true; - } - } - - FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up"); - - for (auto& entry : defaultStreams_) { - DeviceScope scope(entry.first); - - // We created these streams, so are responsible for destroying them - CUDA_VERIFY(cudaStreamDestroy(entry.second)); - } - - for (auto& entry : alternateStreams_) { - DeviceScope scope(entry.first); - - for (auto stream : entry.second) { - CUDA_VERIFY(cudaStreamDestroy(stream)); - } - } - - for (auto& entry : asyncCopyStreams_) { - DeviceScope scope(entry.first); - - CUDA_VERIFY(cudaStreamDestroy(entry.second)); - } - - for 
(auto& entry : blasHandles_) { - DeviceScope scope(entry.first); - - auto blasStatus = cublasDestroy(entry.second); - FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); - } - - if (pinnedMemAlloc_) { pmr->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_); } - }; - - /// Disable allocation of temporary memory; all temporary memory - /// requests will call cudaMalloc / cudaFree at the point of use - void noTempMemory() { setTempMemory(0); }; - - /// Specify that we wish to use a certain fixed size of memory on - /// all devices as temporary memory. This is the upper bound for the GPU - /// memory that we will reserve. We will never go above 1.5 GiB on any GPU; - /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that. - /// To avoid any temporary memory allocation, pass 0. - void setTempMemory(size_t size) - { - if (tempMemSize_ != size) { - // adjust based on general limits - tempMemSize_ = getDefaultTempMemForGPU(-1, size); - - // We need to re-initialize memory resources for all current devices that - // have been initialized. - // This should be safe to do, even if we are currently running work, because - // the cudaFree call that this implies will force-synchronize all GPUs with - // the CPU - for (auto& p : tempMemory_) { - int device = p.first; - // Free the existing memory first - p.second.reset(); - - // Allocate new - p.second = std::unique_ptr( - new StackDeviceMemory(this, - p.first, - // adjust for this specific device - getDefaultTempMemForGPU(device, tempMemSize_))); - } - } - }; - - /// Set amount of pinned memory to allocate, for async GPU <-> CPU - /// transfers - void setPinnedMemory(size_t size) - { - // Should not call this after devices have been initialized - FAISS_ASSERT(defaultStreams_.size() == 0); - FAISS_ASSERT(!pinnedMemAlloc_); - - pinnedMemSize_ = size; - }; - - /// Called to change the stream for work ordering. We do not own `stream`; - /// i.e., it will not be destroyed when the GpuResources object gets cleaned - /// up. - /// We are guaranteed that all Faiss GPU work is ordered with respect to - /// this stream upon exit from an index or other Faiss GPU call. - void setDefaultStream(int device, cudaStream_t stream) - { - if (isInitialized(device)) { - // A new series of calls may not be ordered with what was the previous - // stream, so if the stream being specified is different, then we need to - // ensure ordering between the two (new stream waits on old). - auto it = userDefaultStreams_.find(device); - cudaStream_t prevStream = nullptr; - - if (it != userDefaultStreams_.end()) { - prevStream = it->second; - } else { - FAISS_ASSERT(defaultStreams_.count(device)); - prevStream = defaultStreams_[device]; - } - - if (prevStream != stream) { streamWait({stream}, {prevStream}); } - } - - userDefaultStreams_[device] = stream; - }; - - /// Revert the default stream to the original stream managed by this resources - /// object, in case someone called `setDefaultStream`. - void revertDefaultStream(int device) - { - if (isInitialized(device)) { - auto it = userDefaultStreams_.find(device); - - if (it != userDefaultStreams_.end()) { - // There was a user stream set that we need to synchronize against - cudaStream_t prevStream = userDefaultStreams_[device]; - - FAISS_ASSERT(defaultStreams_.count(device)); - cudaStream_t newStream = defaultStreams_[device]; - - streamWait({newStream}, {prevStream}); - } - } - - userDefaultStreams_.erase(device); - }; - - /// Returns the stream for the given device on which all Faiss GPU work is - /// ordered. 
- /// We are guaranteed that all Faiss GPU work is ordered with respect to - /// this stream upon exit from an index or other Faiss GPU call. - cudaStream_t getDefaultStream(int device) - { - initializeForDevice(device); - - auto it = userDefaultStreams_.find(device); - if (it != userDefaultStreams_.end()) { - // There is a user override stream set - return it->second; - } - - // Otherwise, our base default stream - return defaultStreams_[device]; - }; - - /// Called to change the work ordering streams to the null stream - /// for all devices - void setDefaultNullStreamAllDevices() - { - for (int dev = 0; dev < getNumDevices(); ++dev) { - setDefaultStream(dev, nullptr); - } - }; - - /// If enabled, will print every GPU memory allocation and deallocation to - /// standard output - void setLogMemoryAllocations(bool enable) { allocLogging_ = enable; }; - - public: - /// Internal system calls - - /// Initialize resources for this device - void initializeForDevice(int device) - { - if (isInitialized(device)) { return; } - - // If this is the first device that we're initializing, create our - // pinned memory allocation - if (defaultStreams_.empty() && pinnedMemSize_ > 0) { - pinnedMemAlloc_ = pmr->allocate(pinnedMemSize_); - pinnedMemAllocSize_ = pinnedMemSize_; - } - - FAISS_ASSERT(device < getNumDevices()); - DeviceScope scope(device); - - // Make sure that device properties for all devices are cached - auto& prop = getDeviceProperties(device); - - // Also check to make sure we meet our minimum compute capability (3.0) - FAISS_ASSERT_FMT(prop.major >= 3, - "Device id %d with CC %d.%d not supported, " - "need 3.0+ compute capability", - device, - prop.major, - prop.minor); - - // Create streams - cudaStream_t defaultStream = 0; - CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking)); - - defaultStreams_[device] = defaultStream; - - cudaStream_t asyncCopyStream = 0; - CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking)); - - asyncCopyStreams_[device] = asyncCopyStream; - - std::vector deviceStreams; - for (int j = 0; j < kNumStreams; ++j) { - cudaStream_t stream = 0; - CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - - deviceStreams.push_back(stream); - } - - alternateStreams_[device] = std::move(deviceStreams); - - // Create cuBLAS handle - cublasHandle_t blasHandle = 0; - auto blasStatus = cublasCreate(&blasHandle); - FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); - blasHandles_[device] = blasHandle; - - // For CUDA 10 on V100, enabling tensor core usage would enable automatic - // rounding down of inputs to f16 (though accumulate in f32) which results in - // unacceptable loss of precision in general. - // For CUDA 11 / A100, only enable tensor core support if it doesn't result in - // a loss of precision. 
-#if CUDA_VERSION >= 11000 - cublasSetMathMode(blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); -#endif - - FAISS_ASSERT(allocs_.count(device) == 0); - allocs_[device] = std::unordered_map(); - - FAISS_ASSERT(tempMemory_.count(device) == 0); - auto mem = std::unique_ptr( - new StackDeviceMemory(this, - device, - // adjust for this specific device - getDefaultTempMemForGPU(device, tempMemSize_))); - - tempMemory_.emplace(device, std::move(mem)); - }; - - cublasHandle_t getBlasHandle(int device) - { - initializeForDevice(device); - return blasHandles_[device]; - }; - - std::vector getAlternateStreams(int device) - { - initializeForDevice(device); - return alternateStreams_[device]; - }; - - /// Allocate non-temporary GPU memory - void* allocMemory(const AllocRequest& req) - { - initializeForDevice(req.device); - - // We don't allocate a placeholder for zero-sized allocations - if (req.size == 0) { return nullptr; } - - // Make sure that the allocation is a multiple of 16 bytes for alignment - // purposes - auto adjReq = req; - adjReq.size = utils::roundUp(adjReq.size, (size_t)16); - - void* p = nullptr; - - if (allocLogging_) { std::cout << "RmmGpuResources: alloc " << adjReq.toString() << "\n"; } - - if (adjReq.space == MemorySpace::Temporary) { - // If we don't have enough space in our temporary memory manager, we need - // to allocate this request separately - auto& tempMem = tempMemory_[adjReq.device]; - - if (adjReq.size > tempMem->getSizeAvailable()) { - // We need to allocate this ourselves - AllocRequest newReq = adjReq; - newReq.space = MemorySpace::Device; - newReq.type = AllocType::TemporaryMemoryOverflow; - - return allocMemory(newReq); - } - - // Otherwise, we can handle this locally - p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size); - - } else if (adjReq.space == MemorySpace::Device) { - p = cmr->allocate(adjReq.size, adjReq.stream); - } else if (adjReq.space == MemorySpace::Unified) { - p = mmr->allocate(adjReq.size, adjReq.stream); - } else { - FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space); - } - - allocs_[adjReq.device][p] = adjReq; - - return p; - }; - - /// Returns a previous allocation - void deallocMemory(int device, void* p) - { - FAISS_ASSERT(isInitialized(device)); - - if (!p) { return; } - - auto& a = allocs_[device]; - auto it = a.find(p); - FAISS_ASSERT(it != a.end()); - - auto& req = it->second; - - if (allocLogging_) { std::cout << "RmmGpuResources: dealloc " << req.toString() << "\n"; } - - if (req.space == MemorySpace::Temporary) { - tempMemory_[device]->deallocMemory(device, req.stream, req.size, p); - } else if (req.space == MemorySpace::Device) { - cmr->deallocate(p, req.size, req.stream); - } else if (req.space == MemorySpace::Unified) { - mmr->deallocate(p, req.size, req.stream); - } else { - FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space); - } - - a.erase(it); - }; - - size_t getTempMemoryAvailable(int device) const - { - FAISS_ASSERT(isInitialized(device)); - - auto it = tempMemory_.find(device); - FAISS_ASSERT(it != tempMemory_.end()); - - return it->second->getSizeAvailable(); - }; - - /// Export a description of memory used for Python - std::map>> getMemoryInfo() const - { - using AT = std::map>; - - std::map out; - - for (auto& entry : allocs_) { - AT outDevice; - - for (auto& a : entry.second) { - auto& v = outDevice[allocTypeToString(a.second.type)]; - v.first++; - v.second += a.second.size; - } - - out[entry.first] = std::move(outDevice); - } - - return out; - }; - - 
std::pair getPinnedMemory() - { - return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_); - }; - - cudaStream_t getAsyncCopyStream(int device) - { - initializeForDevice(device); - return asyncCopyStreams_[device]; - }; - - private: - /// Have GPU resources been initialized for this device yet? - bool isInitialized(int device) const - { - // Use default streams as a marker for whether or not a certain - // device has been initialized - return defaultStreams_.count(device) != 0; - }; - - /// Adjust the default temporary memory allocation based on the total GPU - /// memory size - static size_t getDefaultTempMemForGPU(int device, size_t requested) - { - auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem - : std::numeric_limits::max(); - - if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) { - // If the GPU has <= 4 GiB of memory, reserve 512 MiB - - if (requested > k4GiBTempMem) { return k4GiBTempMem; } - } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) { - // If the GPU has <= 8 GiB of memory, reserve 1 GiB - - if (requested > k8GiBTempMem) { return k8GiBTempMem; } - } else { - // Never use more than 1.5 GiB - if (requested > kMaxTempMem) { return kMaxTempMem; } - } - - // use whatever lower limit the user requested - return requested; - }; - - private: - /// Set of currently outstanding memory allocations per device - /// device -> (alloc request, allocated ptr) - std::unordered_map> allocs_; - - /// Temporary memory provider, per each device - std::unordered_map> tempMemory_; - - /// Our default stream that work is ordered on, one per each device - std::unordered_map defaultStreams_; - - /// This contains particular streams as set by the user for - /// ordering, if any - std::unordered_map userDefaultStreams_; - - /// Other streams we can use, per each device - std::unordered_map> alternateStreams_; - - /// Async copy stream to use for GPU <-> CPU pinned memory copies - std::unordered_map asyncCopyStreams_; - - /// cuBLAS handle for each device - std::unordered_map blasHandles_; - - /// Pinned memory allocation for use with this GPU - void* pinnedMemAlloc_; - size_t pinnedMemAllocSize_; - - /// Another option is to use a specified amount of memory on all - /// devices - size_t tempMemSize_; - - /// Amount of pinned memory we should allocate - size_t pinnedMemSize_; - - /// Whether or not we log every GPU memory allocation and deallocation - bool allocLogging_; - - // cuda_memory_resource - std::unique_ptr cmr; - - // managed_memory_resource - std::unique_ptr mmr; - - // pinned_memory_resource - std::unique_ptr pmr; -}; - -/// Default implementation of GpuResources that allocates a cuBLAS -/// stream and 2 streams for use, as well as temporary memory. -/// Internally, the Faiss GPU code uses the instance managed by getResources, -/// but this is the user-facing object that is internally reference counted. -class RmmGpuResources : public GpuResourcesProvider { - public: - RmmGpuResources() : res_(new RmmGpuResourcesImpl){}; - - ~RmmGpuResources(){}; - - std::shared_ptr getResources() { return res_; }; - - /// Disable allocation of temporary memory; all temporary memory - /// requests will call cudaMalloc / cudaFree at the point of use - void noTempMemory() { res_->noTempMemory(); }; - - /// Specify that we wish to use a certain fixed size of memory on - /// all devices as temporary memory. This is the upper bound for the GPU - /// memory that we will reserve. 
We will never go above 1.5 GiB on any GPU; - /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that. - /// To avoid any temporary memory allocation, pass 0. - void setTempMemory(size_t size) { res_->setTempMemory(size); }; - - /// Set amount of pinned memory to allocate, for async GPU <-> CPU - /// transfers - void setPinnedMemory(size_t size) { res_->setPinnedMemory(size); }; - - /// Called to change the stream for work ordering. We do not own `stream`; - /// i.e., it will not be destroyed when the GpuResources object gets cleaned - /// up. - /// We are guaranteed that all Faiss GPU work is ordered with respect to - /// this stream upon exit from an index or other Faiss GPU call. - void setDefaultStream(int device, cudaStream_t stream) - { - res_->setDefaultStream(device, stream); - }; - - /// Revert the default stream to the original stream managed by this resources - /// object, in case someone called `setDefaultStream`. - void revertDefaultStream(int device) { res_->revertDefaultStream(device); }; - - /// Called to change the work ordering streams to the null stream - /// for all devices - void setDefaultNullStreamAllDevices() { res_->setDefaultNullStreamAllDevices(); }; - - /// Export a description of memory used for Python - std::map>> getMemoryInfo() const - { - return res_->getMemoryInfo(); - }; - - /// Returns the current default stream - cudaStream_t getDefaultStream(int device) { return res_->getDefaultStream(device); }; - - /// Returns the current amount of temp memory available - size_t getTempMemoryAvailable(int device) const { return res_->getTempMemoryAvailable(device); }; - - /// Synchronize our default stream with the CPU - void syncDefaultStreamCurrentDevice() { res_->syncDefaultStreamCurrentDevice(); }; - - /// If enabled, will print every GPU memory allocation and deallocation to - /// standard output - void setLogMemoryAllocations(bool enable) { res_->setLogMemoryAllocations(enable); }; - - private: - std::shared_ptr res_; -}; - -} // namespace Reachability -} // namespace detail -} // namespace HDBSCAN -} // namespace ML diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh index e543a0e5f4..0b2e19aaf6 100644 --- a/cpp/src/hdbscan/detail/reachability.cuh +++ b/cpp/src/hdbscan/detail/reachability.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,13 +16,12 @@ #pragma once -#include "reachability_faiss.cuh" - #include #include #include +#include #include #include @@ -32,6 +31,10 @@ #include #include +#if defined RAFT_COMPILED +#include +#endif + #include #include #include @@ -62,8 +65,6 @@ void core_distances( ASSERT(n_neighbors >= min_samples, "the size of the neighborhood should be greater than or equal to min_samples"); - int blocks = raft::ceildiv(n, (size_t)tpb); - auto exec_policy = rmm::exec_policy(stream); auto indices = thrust::make_counting_iterator(0); @@ -162,6 +163,67 @@ void _compute_core_dists(const raft::handle_t& handle, core_distances(dists.data(), min_samples, min_samples, m, core_dists, stream); } +// Functor to post-process distances into reachability space +template <typename value_idx, typename value_t> +struct ReachabilityPostProcess { + DI value_t operator()(value_t value, value_idx row, value_idx col) const + { + return max(core_dists[col], max(core_dists[row], alpha * value)); + } + + const value_t* core_dists; + value_t alpha; +}; + +/** + * Given core distances, fuses the computation of L2 distances between all + * points, the projection into mutual reachability space, and the k-selection. + * @tparam value_idx + * @tparam value_t + * @param[in] handle raft handle for resource reuse + * @param[out] out_inds output indices array (size m * k) + * @param[out] out_dists output distances array (size m * k) + * @param[in] X input data points (size m * n) + * @param[in] m number of rows in X + * @param[in] n number of columns in X + * @param[in] k neighborhood size (includes self-loop) + * @param[in] core_dists array of core distances (size m) + */ +template <typename value_idx, typename value_t> +void mutual_reachability_knn_l2(const raft::handle_t& handle, + value_idx* out_inds, + value_t* out_dists, + const value_t* X, + size_t m, + size_t n, + int k, + value_t* core_dists, + value_t alpha) +{ + // Create a functor to postprocess distances into mutual reachability space + // Note that we can't use a lambda for this here, since we get errors like: + // `A type local to a function cannot be used in the template argument of the + // enclosing parent function (and any parent classes) of an extended __device__ + // or __host__ __device__ lambda` + auto epilogue = ReachabilityPostProcess<value_idx, value_t>{core_dists, alpha}; + + auto X_view = raft::make_device_matrix_view(X, m, n); + std::vector<raft::device_matrix_view<const value_t, size_t>> index = {X_view}; + + raft::neighbors::brute_force::knn( + handle, + index, + X_view, + raft::make_device_matrix_view(out_inds, m, static_cast<size_t>(k)), + raft::make_device_matrix_view(out_dists, m, static_cast<size_t>(k)), + // TODO: expand distance metrics to support more than just L2 distance + // https://github.com/rapidsai/cuml/issues/5301 + raft::distance::DistanceType::L2SqrtExpanded, + std::make_optional(2.0f), + std::nullopt, + epilogue); +} + /** * Constructs a mutual reachability graph, which is a k-nearest neighbors * graph projected into mutual reachability space using the following @@ -268,4 +330,4 @@ void mutual_reachability_graph(const raft::handle_t& handle, }; // end namespace Reachability }; // end namespace detail }; // end namespace HDBSCAN -}; // end namespace ML \ No newline at end of file +}; // end namespace ML diff --git a/cpp/src/hdbscan/detail/reachability_faiss.cuh b/cpp/src/hdbscan/detail/reachability_faiss.cuh deleted file mode 100644 index 255bbbc98a..0000000000 --- a/cpp/src/hdbscan/detail/reachability_faiss.cuh +++ /dev/null @@ -1,418 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates.
- * - * This source code is licensed under the MIT license found in the - * LICENSE file thirdparty/LICENSES/LICENSE.faiss - */ - -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include - -#include - -#include - -#include "faiss_mr.hpp" - -namespace ML { -namespace HDBSCAN { -namespace detail { -namespace Reachability { - -template -__global__ void l2SelectMinK(faiss::gpu::Tensor inner_products, - faiss::gpu::Tensor sq_norms, - faiss::gpu::Tensor core_dists, - faiss::gpu::Tensor out_dists, - faiss::gpu::Tensor out_inds, - int batch_offset_i, - int batch_offset_j, - int k, - value_t initK, - value_t alpha) -{ - // Each block handles a single row of the distances (results) - constexpr int kNumWarps = ThreadsPerBlock / 32; - - __shared__ value_t smemK[kNumWarps * NumWarpQ]; - __shared__ int smemV[kNumWarps * NumWarpQ]; - - faiss::gpu::BlockSelect, - NumWarpQ, - NumThreadQ, - ThreadsPerBlock> - heap(initK, -1, smemK, smemV, k); - - int row = blockIdx.x; - - // Whole warps must participate in the selection - int limit = faiss::gpu::utils::roundDown(inner_products.getSize(1), 32); - int i = threadIdx.x; - - for (; i < limit; i += blockDim.x) { - value_t v = sqrt(faiss::gpu::Math::add( - sq_norms[row + batch_offset_i], - faiss::gpu::Math::add(sq_norms[i + batch_offset_j], inner_products[row][i]))); - - v = max(core_dists[i + batch_offset_j], max(core_dists[row + batch_offset_i], alpha * v)); - heap.add(v, i); - } - - if (i < inner_products.getSize(1)) { - value_t v = sqrt(faiss::gpu::Math::add( - sq_norms[row + batch_offset_i], - faiss::gpu::Math::add(sq_norms[i + batch_offset_j], inner_products[row][i]))); - - v = max(core_dists[i + batch_offset_j], max(core_dists[row + batch_offset_i], alpha * v)); - heap.addThreadQ(v, i); - } - - heap.reduce(); - for (int i = threadIdx.x; i < k; i += blockDim.x) { - out_dists[row][i] = smemK[i]; - out_inds[row][i] = smemV[i]; - } -} - -/** - * Computes expanded L2 metric, projects points into reachability - * space, and performs a k-select. 
- * @tparam value_t - * @param[in] productDistances Tensor (or blocked view) of inner products - * @param[in] centroidDistances Tensor of l2 norms - * @param[in] coreDistances Tensor of core distances - * @param[out] outDistances Tensor of output distances - * @param[out] outIndices Tensor of output indices - * @param[in] batch_offset starting row (used when productDistances is a batch) - * @param[in] k number of neighbors to select - * @param[in] stream cuda stream for ordering gpu computations - */ -template -void runL2SelectMin(faiss::gpu::Tensor& productDistances, - faiss::gpu::Tensor& centroidDistances, - faiss::gpu::Tensor& coreDistances, - faiss::gpu::Tensor& outDistances, - faiss::gpu::Tensor& outIndices, - int batch_offset_i, - int batch_offset_j, - int k, - value_t alpha, - cudaStream_t stream) -{ - FAISS_ASSERT(productDistances.getSize(0) == outDistances.getSize(0)); - FAISS_ASSERT(productDistances.getSize(0) == outIndices.getSize(0)); - // FAISS_ASSERT(centroidDistances.getSize(0) == productDistances.getSize(1)); - FAISS_ASSERT(outDistances.getSize(1) == k); - FAISS_ASSERT(outIndices.getSize(1) == k); - FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); - - auto grid = dim3(outDistances.getSize(0)); - -#define RUN_L2_SELECT(BLOCK, NUM_WARP_Q, NUM_THREAD_Q) \ - do { \ - l2SelectMinK \ - <<>>(productDistances, \ - centroidDistances, \ - coreDistances, \ - outDistances, \ - outIndices, \ - batch_offset_i, \ - batch_offset_j, \ - k, \ - faiss::gpu::Limits::getMax(), \ - alpha); \ - } while (0) - - // block size 128 for everything <= 1024 - if (k <= 32) { - RUN_L2_SELECT(128, 32, 2); - } else if (k <= 64) { - RUN_L2_SELECT(128, 64, 3); - } else if (k <= 128) { - RUN_L2_SELECT(128, 128, 3); - } else if (k <= 256) { - RUN_L2_SELECT(128, 256, 4); - } else if (k <= 512) { - RUN_L2_SELECT(128, 512, 8); - } else if (k <= 1024) { - RUN_L2_SELECT(128, 1024, 8); - -#if GPU_MAX_SELECTION_K >= 2048 - } else if (k <= 2048) { - // smaller block for less shared memory - RUN_L2_SELECT(64, 2048, 8); -#endif - - } else { - FAISS_ASSERT(false); - } -} - -/** - * Given core distances, Fuses computations of L2 distances between all - * points, projection into mutual reachability space, and k-selection. 
- * @tparam value_idx - * @tparam value_t - * @param[in] handle raft handle for resource reuse - * @param[out] out_inds output indices array (size m * k) - * @param[out] out_dists output distances array (size m * k) - * @param[in] X input data points (size m * n) - * @param[in] m number of rows in X - * @param[in] n number of columns in X - * @param[in] k neighborhood size (includes self-loop) - * @param[in] core_dists array of core distances (size m) - */ -template -void mutual_reachability_knn_l2(const raft::handle_t& handle, - value_idx* out_inds, - value_t* out_dists, - const value_t* X, - size_t m, - size_t n, - int k, - value_t* core_dists, - value_t alpha) -{ - auto device = faiss::gpu::getCurrentDevice(); - auto stream = handle.get_stream(); - - faiss::gpu::DeviceScope ds(device); - RmmGpuResources res; - - res.noTempMemory(); - res.setDefaultStream(device, stream); - - auto resImpl = res.getResources(); - auto gpu_res = resImpl.get(); - - gpu_res->initializeForDevice(device); - gpu_res->setDefaultStream(device, stream); - - device = faiss::gpu::getCurrentDevice(); - - auto tmp_mem_cur_device = gpu_res->getTempMemoryAvailableCurrentDevice(); - - /** - * Compute L2 norm - */ - rmm::device_uvector norms(m, stream); - - auto core_dists_tensor = faiss::gpu::toDeviceTemporary( - gpu_res, - device, - const_cast(reinterpret_cast(core_dists)), - stream, - {(int)m}); - - auto x_tensor = faiss::gpu::toDeviceTemporary( - gpu_res, - device, - const_cast(reinterpret_cast(X)), - stream, - {(int)m, (int)n}); - - auto out_dists_tensor = faiss::gpu::toDeviceTemporary( - gpu_res, - device, - const_cast(reinterpret_cast(out_dists)), - stream, - {(int)m, k}); - - auto out_inds_tensor = faiss::gpu::toDeviceTemporary( - gpu_res, - device, - const_cast(reinterpret_cast(out_inds)), - stream, - {(int)m, k}); - - auto norms_tensor = faiss::gpu::toDeviceTemporary( - gpu_res, - device, - const_cast(reinterpret_cast(norms.data())), - stream, - {(int)m}); - - runL2Norm(x_tensor, true, norms_tensor, true, stream); - - /** - * Tile over PW dists, accumulating k-select - */ - - int tileRows = 0; - int tileCols = 0; - faiss::gpu::chooseTileSize(m, m, n, sizeof(value_t), tmp_mem_cur_device, tileRows, tileCols); - - int numColTiles = raft::ceildiv(m, (size_t)tileCols); - - faiss::gpu::DeviceTensor distanceBuf1( - gpu_res, faiss::gpu::makeTempAlloc(faiss::gpu::AllocType::Other, stream), {tileRows, tileCols}); - faiss::gpu::DeviceTensor distanceBuf2( - gpu_res, faiss::gpu::makeTempAlloc(faiss::gpu::AllocType::Other, stream), {tileRows, tileCols}); - - faiss::gpu::DeviceTensor* distanceBufs[2] = {&distanceBuf1, &distanceBuf2}; - - faiss::gpu::DeviceTensor outDistanceBuf1( - gpu_res, - faiss::gpu::makeTempAlloc(faiss::gpu::AllocType::Other, stream), - {tileRows, numColTiles * k}); - faiss::gpu::DeviceTensor outDistanceBuf2( - gpu_res, - faiss::gpu::makeTempAlloc(faiss::gpu::AllocType::Other, stream), - {tileRows, numColTiles * k}); - faiss::gpu::DeviceTensor* outDistanceBufs[2] = {&outDistanceBuf1, - &outDistanceBuf2}; - - faiss::gpu::DeviceTensor outIndexBuf1( - gpu_res, - faiss::gpu::makeTempAlloc(faiss::gpu::AllocType::Other, stream), - {tileRows, numColTiles * k}); - faiss::gpu::DeviceTensor outIndexBuf2( - gpu_res, - faiss::gpu::makeTempAlloc(faiss::gpu::AllocType::Other, stream), - {tileRows, numColTiles * k}); - faiss::gpu::DeviceTensor* outIndexBufs[2] = {&outIndexBuf1, &outIndexBuf2}; - - auto streams = gpu_res->getAlternateStreamsCurrentDevice(); - faiss::gpu::streamWait(streams, {stream}); - - int curStream = 
0; - bool interrupt = false; - - // Tile over the input queries - for (std::size_t i = 0; i < m; i += tileRows) { - if (interrupt || faiss::InterruptCallback::is_interrupted()) { - interrupt = true; - break; - } - - int curQuerySize = std::min(static_cast(tileRows), m - i); - - auto outDistanceView = out_dists_tensor.narrow(0, i, curQuerySize); - auto outIndexView = out_inds_tensor.narrow(0, i, curQuerySize); - - auto queryView = x_tensor.narrow(0, i, curQuerySize); - - auto outDistanceBufRowView = outDistanceBufs[curStream]->narrow(0, 0, curQuerySize); - auto outIndexBufRowView = outIndexBufs[curStream]->narrow(0, 0, curQuerySize); - - // Tile over the centroids - for (std::size_t j = 0; j < m; j += tileCols) { - if (faiss::InterruptCallback::is_interrupted()) { - interrupt = true; - break; - } - - int curCentroidSize = std::min(static_cast(tileCols), m - j); - int curColTile = j / tileCols; - - auto centroidsView = sliceCentroids(x_tensor, true, j, curCentroidSize); - - auto distanceBufView = - distanceBufs[curStream]->narrow(0, 0, curQuerySize).narrow(1, 0, curCentroidSize); - - auto outDistanceBufColView = outDistanceBufRowView.narrow(1, k * curColTile, k); - auto outIndexBufColView = outIndexBufRowView.narrow(1, k * curColTile, k); - - runMatrixMult(distanceBufView, - false, // not transposed - queryView, - false, // transposed MM if col major - centroidsView, - true, // transposed MM if row major - -2.0f, - 0.0f, - gpu_res->getBlasHandleCurrentDevice(), - streams[curStream]); - - if (static_cast(tileCols) == m) { - // Write into the final output - runL2SelectMin(distanceBufView, - norms_tensor, - core_dists_tensor, - outDistanceView, - outIndexView, - i, - j, - k, - alpha, - streams[curStream]); - } else { - // Write into our intermediate output - runL2SelectMin(distanceBufView, - norms_tensor, - core_dists_tensor, - outDistanceBufColView, - outIndexBufColView, - i, - j, - k, - alpha, - streams[curStream]); - } - } - - // As we're finished with processing a full set of centroids, perform - // the final k-selection - if (static_cast(tileCols) != m) { - // The indices are tile-relative; for each tile of k, we need to add - // tileCols to the index - faiss::gpu::runIncrementIndex(outIndexBufRowView, k, tileCols, streams[curStream]); - - faiss::gpu::runBlockSelectPair(outDistanceBufRowView, - outIndexBufRowView, - outDistanceView, - outIndexView, - false, - k, - streams[curStream]); - } - - curStream = (curStream + 1) % 2; - } - - // Have the desired ordering stream wait on the multi-stream - faiss::gpu::streamWait({stream}, streams); - - if (interrupt) { FAISS_THROW_MSG("interrupted"); } -} - -}; // end namespace Reachability -}; // end namespace detail -}; // end namespace HDBSCAN -}; // end namespace ML diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index f0f8437070..f919b1a608 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -45,7 +45,6 @@ function(ConfigureTest) rmm::rmm raft::raft $<$:raft::compiled> - $<$:faiss> GTest::gtest GTest::gtest_main ${OpenMP_CXX_LIB_NAMES} diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx index ba52652ef4..8c4ebb854d 100644 --- a/python/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cluster/hdbscan/hdbscan.pyx @@ -115,7 +115,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::Common": int* labels, int* inverse_label_map, int n_selected_clusters, - PredictionData[int, float]& prediction_data) + PredictionData[int, float]& prediction_data) except + 
cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML": @@ -125,7 +125,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML": DistanceType metric, HDBSCANParams & params, hdbscan_output & output, - float * core_dists) + float * core_dists) except + void build_condensed_hierarchy( const handle_t &handle, @@ -134,7 +134,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML": const int *sizes, int min_cluster_size, int n_leaves, - CondensedHierarchy[int, float] &condensed_tree) + CondensedHierarchy[int, float] &condensed_tree) except + void _extract_clusters(const handle_t &handle, size_t n_leaves, int n_edges, int *parents, int *children, @@ -142,7 +142,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML": float *probabilities, CLUSTER_SELECTION_METHOD cluster_selection_method, bool allow_single_cluster, int max_cluster_size, - float cluster_selection_epsilon) + float cluster_selection_epsilon) except + cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::HELPER": @@ -152,7 +152,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::HELPER": size_t m, size_t n, DistanceType metric, - int min_samples) + int min_samples) except + void compute_inverse_label_map(const handle_t& handle, CondensedHierarchy[int, float]& @@ -163,7 +163,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::HELPER": device_uvector[int]& inverse_label_map, bool allow_single_cluster, int max_cluster_size, - float cluster_selection_epsilon) + float cluster_selection_epsilon) except + _metrics_mapping = { 'l1': DistanceType.L1,
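Reviewer note: the semantics of the new ReachabilityPostProcess epilogue in cpp/src/hdbscan/detail/reachability.cuh can be sanity-checked with the small host-side sketch below. This snippet is not part of the patch and does not touch the RAFT API; it mirrors the functor's shape in plain C++ (the DI device-inline macro is dropped, max comes from <algorithm>), and the data in main is made up for illustration.

// Host-side sketch of the device epilogue applied by the fused kNN:
// for a raw distance d(row, col) it emits
// max(core_dists[col], core_dists[row], alpha * d).
#include <algorithm>
#include <cstdio>
#include <vector>

template <typename value_idx, typename value_t>
struct ReachabilityPostProcess {
  value_t operator()(value_t value, value_idx row, value_idx col) const
  {
    return std::max(core_dists[col], std::max(core_dists[row], alpha * value));
  }
  const value_t* core_dists;
  value_t alpha;
};

int main()
{
  // Hypothetical core distances for three points.
  std::vector<float> core_dists{0.5f, 2.0f, 1.5f};
  ReachabilityPostProcess<int, float> post{core_dists.data(), /*alpha=*/1.0f};

  // A raw L2 distance of 1.0 between points 0 and 2 is clamped up to the
  // larger of the two core distances: max(1.5, max(0.5, 1.0 * 1.0)) == 1.5.
  std::printf("d_mreach(0, 2) = %.2f\n", post(1.0f, 0, 2));
  return 0;
}

With the default alpha = 1.0 this is the standard mutual reachability distance max(core_k(a), core_k(b), d(a, b)); passing it as a distance epilogue lets RAFT's brute-force kNN produce mutual reachability neighborhoods directly, which is what the deleted l2SelectMinK kernel computed inline on the FAISS path.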