From 9e18070088cc5a8ba80a36c3d5ebcabcb8d6d109 Mon Sep 17 00:00:00 2001
From: viclafargue
Date: Wed, 20 Oct 2021 14:56:53 +0200
Subject: [PATCH 1/7] RMM for FAISS

---
 cpp/include/raft/mr/faiss_mr.hpp              | 642 ++++++++++++++++++
 cpp/include/raft/spatial/knn/ann.hpp          |   2 +-
 cpp/include/raft/spatial/knn/ann_common.h     |   4 +-
 .../knn/detail/ann_quantized_faiss.cuh        |   5 +-
 .../spatial/knn/detail/haversine_distance.cuh |   2 +-
 .../knn/detail/knn_brute_force_faiss.cuh      |   4 +-
 .../spatial/knn/detail/selection_faiss.cuh    |   2 +-
 7 files changed, 651 insertions(+), 10 deletions(-)
 create mode 100644 cpp/include/raft/mr/faiss_mr.hpp

diff --git a/cpp/include/raft/mr/faiss_mr.hpp b/cpp/include/raft/mr/faiss_mr.hpp
new file mode 100644
index 0000000000..844ea6df13
--- /dev/null
+++ b/cpp/include/raft/mr/faiss_mr.hpp
@@ -0,0 +1,642 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <faiss/gpu/GpuResources.h>
+#include <faiss/gpu/utils/DeviceUtils.h>
+#include <faiss/gpu/utils/StackDeviceMemory.h>
+#include <faiss/gpu/utils/StaticUtils.h>
+#include <faiss/impl/FaissAssert.h>
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <unordered_map>
+
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/host/pinned_memory_resource.hpp>
+
+namespace raft {
+namespace mr {
+
+using namespace faiss::gpu;
+
+namespace {
+
+// How many streams per device we allocate by default (for multi-streaming)
+constexpr int kNumStreams = 2;
+
+// Use 256 MiB of pinned memory for async CPU <-> GPU copies by default
+constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024;
+
+// Default temporary memory allocation for <= 4 GiB memory GPUs
+constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024;
+
+// Default temporary memory allocation for <= 8 GiB memory GPUs
+constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024;
+
+// Maximum temporary memory allocation for all GPUs
+constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024;
+
+std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
+  // Produce a sorted list of all outstanding allocations by type
+  std::unordered_map<AllocType, std::pair<int, size_t>> stats;
+
+  for (auto& entry : map) {
+    auto& a = entry.second;
+
+    auto it = stats.find(a.type);
+    if (it != stats.end()) {
+      stats[a.type].first++;
+      stats[a.type].second += a.size;
+    } else {
+      stats[a.type] = std::make_pair(1, a.size);
+    }
+  }
+
+  std::stringstream ss;
+  for (auto& entry : stats) {
+    ss << "Alloc type " << allocTypeToString(entry.first) << ": "
+       << entry.second.first << " allocations, " << entry.second.second
+       << " bytes\n";
+  }
+
+  return ss.str();
+}
+
+} // namespace
+
+/// RMM implementation of the GpuResources object that provides for a
+/// temporary memory manager
+class RmmGpuResourcesImpl : public GpuResources {
+ public:
+  RmmGpuResourcesImpl()
+    : pinnedMemAlloc_(nullptr),
+      pinnedMemAllocSize_(0),
+      // let the adjustment function determine the memory size for us by passing
+      // in a huge value that will then be adjusted
+      tempMemSize_(
+        getDefaultTempMemForGPU(-1, std::numeric_limits<size_t>::max())),
+      pinnedMemSize_(kDefaultPinnedMemoryAllocation),
+      allocLogging_(false),
+      cmr(new rmm::mr::cuda_memory_resource),
+      mmr(new rmm::mr::managed_memory_resource),
+      pmr(new rmm::mr::pinned_memory_resource){};
+
+  ~RmmGpuResourcesImpl() {
+    // The temporary memory allocator has allocated memory through us, so clean
+    // that up before we finish fully de-initializing ourselves
+    tempMemory_.clear();
+
+    // Make sure all allocations have been freed
+    bool allocError = false;
+
+    for (auto& entry : allocs_) {
+      auto& map = entry.second;
+
+      if (!map.empty()) {
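+        // Report the outstanding allocations for this device before asserting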
+        std::cerr << "RmmGpuResources destroyed with allocations outstanding:\n"
+                  << "Device " << entry.first << " outstanding allocations:\n";
+        std::cerr << allocsToString(map);
+        allocError = true;
+      }
+    }
+
+    FAISS_ASSERT_MSG(!allocError,
+                     "GPU memory allocations not properly cleaned up");
+
+    for (auto& entry : defaultStreams_) {
+      DeviceScope scope(entry.first);
+
+      // We created these streams, so are responsible for destroying them
+      CUDA_VERIFY(cudaStreamDestroy(entry.second));
+    }
+
+    for (auto& entry : alternateStreams_) {
+      DeviceScope scope(entry.first);
+
+      for (auto stream : entry.second) {
+        CUDA_VERIFY(cudaStreamDestroy(stream));
+      }
+    }
+
+    for (auto& entry : asyncCopyStreams_) {
+      DeviceScope scope(entry.first);
+
+      CUDA_VERIFY(cudaStreamDestroy(entry.second));
+    }
+
+    for (auto& entry : blasHandles_) {
+      DeviceScope scope(entry.first);
+
+      auto blasStatus = cublasDestroy(entry.second);
+      FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+    }
+
+    if (pinnedMemAlloc_) {
+      pmr->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_);
+    }
+  };
+
+  /// Disable allocation of temporary memory; all temporary memory
+  /// requests will call cudaMalloc / cudaFree at the point of use
+  void noTempMemory() { setTempMemory(0); };
+
+  /// Specify that we wish to use a certain fixed size of memory on
+  /// all devices as temporary memory. This is the upper bound for the GPU
+  /// memory that we will reserve. We will never go above 1.5 GiB on any GPU;
+  /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that.
+  /// To avoid any temporary memory allocation, pass 0.
+  void setTempMemory(size_t size) {
+    if (tempMemSize_ != size) {
+      // adjust based on general limits
+      tempMemSize_ = getDefaultTempMemForGPU(-1, size);
+
+      // We need to re-initialize memory resources for all current devices that
+      // have been initialized.
+      // This should be safe to do, even if we are currently running work, because
+      // the cudaFree call that this implies will force-synchronize all GPUs with
+      // the CPU
+      for (auto& p : tempMemory_) {
+        int device = p.first;
+        // Free the existing memory first
+        p.second.reset();
+
+        // Allocate new
+        p.second = std::unique_ptr<StackDeviceMemory>(
+          new StackDeviceMemory(this, p.first,
+                                // adjust for this specific device
+                                getDefaultTempMemForGPU(device, tempMemSize_)));
+      }
+    }
+  };
+
+  /// Set amount of pinned memory to allocate, for async GPU <-> CPU
+  /// transfers
+  void setPinnedMemory(size_t size) {
+    // Should not call this after devices have been initialized
+    FAISS_ASSERT(defaultStreams_.size() == 0);
+    FAISS_ASSERT(!pinnedMemAlloc_);
+
+    pinnedMemSize_ = size;
+  };
+
+  /// Called to change the stream for work ordering. We do not own `stream`;
+  /// i.e., it will not be destroyed when the GpuResources object gets cleaned
+  /// up.
+  /// We are guaranteed that all Faiss GPU work is ordered with respect to
+  /// this stream upon exit from an index or other Faiss GPU call.
+  void setDefaultStream(int device, cudaStream_t stream) {
+    if (isInitialized(device)) {
+      // A new series of calls may not be ordered with what was the previous
+      // stream, so if the stream being specified is different, then we need to
+      // ensure ordering between the two (new stream waits on old).
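+      // Look up the previous default stream: a user-supplied one if present,
+      // otherwise the stream we created for this device.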
+      auto it = userDefaultStreams_.find(device);
+      cudaStream_t prevStream = nullptr;
+
+      if (it != userDefaultStreams_.end()) {
+        prevStream = it->second;
+      } else {
+        FAISS_ASSERT(defaultStreams_.count(device));
+        prevStream = defaultStreams_[device];
+      }
+
+      if (prevStream != stream) {
+        streamWait({stream}, {prevStream});
+      }
+    }
+
+    userDefaultStreams_[device] = stream;
+  };
+
+  /// Revert the default stream to the original stream managed by this resources
+  /// object, in case someone called `setDefaultStream`.
+  void revertDefaultStream(int device) {
+    if (isInitialized(device)) {
+      auto it = userDefaultStreams_.find(device);
+
+      if (it != userDefaultStreams_.end()) {
+        // There was a user stream set that we need to synchronize against
+        cudaStream_t prevStream = userDefaultStreams_[device];
+
+        FAISS_ASSERT(defaultStreams_.count(device));
+        cudaStream_t newStream = defaultStreams_[device];
+
+        streamWait({newStream}, {prevStream});
+      }
+    }
+
+    userDefaultStreams_.erase(device);
+  };
+
+  /// Returns the stream for the given device on which all Faiss GPU work is
+  /// ordered.
+  /// We are guaranteed that all Faiss GPU work is ordered with respect to
+  /// this stream upon exit from an index or other Faiss GPU call.
+  cudaStream_t getDefaultStream(int device) {
+    initializeForDevice(device);
+
+    auto it = userDefaultStreams_.find(device);
+    if (it != userDefaultStreams_.end()) {
+      // There is a user override stream set
+      return it->second;
+    }
+
+    // Otherwise, our base default stream
+    return defaultStreams_[device];
+  };
+
+  /// Called to change the work ordering streams to the null stream
+  /// for all devices
+  void setDefaultNullStreamAllDevices() {
+    for (int dev = 0; dev < getNumDevices(); ++dev) {
+      setDefaultStream(dev, nullptr);
+    }
+  };
+
+  /// If enabled, will print every GPU memory allocation and deallocation to
+  /// standard output
+  void setLogMemoryAllocations(bool enable) { allocLogging_ = enable; };
+
+ public:
+  /// Internal system calls
+
+  /// Initialize resources for this device
+  void initializeForDevice(int device) {
+    if (isInitialized(device)) {
+      return;
+    }
+
+    // If this is the first device that we're initializing, create our
+    // pinned memory allocation
+    if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
+      pinnedMemAlloc_ = pmr->allocate(pinnedMemSize_);
+      pinnedMemAllocSize_ = pinnedMemSize_;
+    }
+
+    FAISS_ASSERT(device < getNumDevices());
+    DeviceScope scope(device);
+
+    // Make sure that device properties for all devices are cached
+    auto& prop = getDeviceProperties(device);
+
+    // Also check to make sure we meet our minimum compute capability (3.0)
+    FAISS_ASSERT_FMT(prop.major >= 3,
+                     "Device id %d with CC %d.%d not supported, "
+                     "need 3.0+ compute capability",
+                     device, prop.major, prop.minor);
+
+    // Create streams
+    cudaStream_t defaultStream = 0;
+    CUDA_VERIFY(
+      cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));
+
+    defaultStreams_[device] = defaultStream;
+
+    cudaStream_t asyncCopyStream = 0;
+    CUDA_VERIFY(
+      cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));
+
+    asyncCopyStreams_[device] = asyncCopyStream;
+
+    std::vector<cudaStream_t> deviceStreams;
+    for (int j = 0; j < kNumStreams; ++j) {
+      cudaStream_t stream = 0;
+      CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+
+      deviceStreams.push_back(stream);
+    }
+
+    alternateStreams_[device] = std::move(deviceStreams);
+
+    // Create cuBLAS handle
+    cublasHandle_t blasHandle = 0;
+    auto blasStatus = cublasCreate(&blasHandle);
+    FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+    blasHandles_[device] = blasHandle;
+
+    // For CUDA 10 on V100, enabling tensor core usage would enable automatic
+    // rounding down of inputs to f16 (though accumulate in f32) which results in
+    // unacceptable loss of precision in general.
+    // For CUDA 11 / A100, only enable tensor core support if it doesn't result in
+    // a loss of precision.
+#if CUDA_VERSION >= 11000
+    cublasSetMathMode(blasHandle,
+                      CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+#endif
+
+    FAISS_ASSERT(allocs_.count(device) == 0);
+    allocs_[device] = std::unordered_map<void*, AllocRequest>();
+
+    FAISS_ASSERT(tempMemory_.count(device) == 0);
+    auto mem = std::unique_ptr<StackDeviceMemory>(
+      new StackDeviceMemory(this, device,
+                            // adjust for this specific device
+                            getDefaultTempMemForGPU(device, tempMemSize_)));
+
+    tempMemory_.emplace(device, std::move(mem));
+  };
+
+  cublasHandle_t getBlasHandle(int device) {
+    initializeForDevice(device);
+    return blasHandles_[device];
+  };
+
+  std::vector<cudaStream_t> getAlternateStreams(int device) {
+    initializeForDevice(device);
+    return alternateStreams_[device];
+  };
+
+  /// Allocate non-temporary GPU memory
+  void* allocMemory(const AllocRequest& req) {
+    initializeForDevice(req.device);
+
+    // We don't allocate a placeholder for zero-sized allocations
+    if (req.size == 0) {
+      return nullptr;
+    }
+
+    // Make sure that the allocation is a multiple of 16 bytes for alignment
+    // purposes
+    auto adjReq = req;
+    adjReq.size = utils::roundUp(adjReq.size, (size_t)16);
+
+    void* p = nullptr;
+
+    if (allocLogging_) {
+      std::cout << "RmmGpuResources: alloc " << adjReq.toString() << "\n";
+    }
+
+    if (adjReq.space == MemorySpace::Temporary) {
+      // If we don't have enough space in our temporary memory manager, we need
+      // to allocate this request separately
+      auto& tempMem = tempMemory_[adjReq.device];
+
+      if (adjReq.size > tempMem->getSizeAvailable()) {
+        // We need to allocate this ourselves
+        AllocRequest newReq = adjReq;
+        newReq.space = MemorySpace::Device;
+        newReq.type = AllocType::TemporaryMemoryOverflow;
+
+        return allocMemory(newReq);
+      }
+
+      // Otherwise, we can handle this locally
+      p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
+
+    } else if (adjReq.space == MemorySpace::Device) {
+      p = cmr->allocate(adjReq.size, adjReq.stream);
+    } else if (adjReq.space == MemorySpace::Unified) {
+      p = mmr->allocate(adjReq.size, adjReq.stream);
+    } else {
+      FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
+    }
+
+    allocs_[adjReq.device][p] = adjReq;
+
+    return p;
+  };
+
+  /// Frees a previous allocation
+  void deallocMemory(int device, void* p) {
+    FAISS_ASSERT(isInitialized(device));
+
+    if (!p) {
+      return;
+    }
+
+    auto& a = allocs_[device];
+    auto it = a.find(p);
+    FAISS_ASSERT(it != a.end());
+
+    auto& req = it->second;
+
+    if (allocLogging_) {
+      std::cout << "RmmGpuResources: dealloc " << req.toString() << "\n";
+    }
+
+    if (req.space == MemorySpace::Temporary) {
+      tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
+    } else if (req.space == MemorySpace::Device) {
+      cmr->deallocate(p, req.size, req.stream);
+    } else if (req.space == MemorySpace::Unified) {
+      mmr->deallocate(p, req.size, req.stream);
+    } else {
+      FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
+    }
+
+    a.erase(it);
+  };
+
+  size_t getTempMemoryAvailable(int device) const {
+    FAISS_ASSERT(isInitialized(device));
+
+    auto it = tempMemory_.find(device);
+    FAISS_ASSERT(it != tempMemory_.end());
+
+    return it->second->getSizeAvailable();
+  };
+
+  /// Export a description of memory used for Python
+  std::map<int, std::map<std::string, std::pair<int, size_t>>> getMemoryInfo()
+    const {
+    using AT = std::map<std::string, std::pair<int, size_t>>;
+
+    std::map<int, AT> out;
+
+    for (auto& entry : allocs_) {
+      AT outDevice;
+
+      for (auto& a : entry.second) {
+        auto& v = outDevice[allocTypeToString(a.second.type)];
+        v.first++;
+        v.second += a.second.size;
+      }
+
+      out[entry.first] = std::move(outDevice);
+    }
+
+    return out;
+  };
+
+  std::pair<void*, size_t> getPinnedMemory() {
+    return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
+  };
+
+  cudaStream_t getAsyncCopyStream(int device) {
+    initializeForDevice(device);
+    return asyncCopyStreams_[device];
+  };
+
+ private:
+  /// Have GPU resources been initialized for this device yet?
+  bool isInitialized(int device) const {
+    // Use default streams as a marker for whether or not a certain
+    // device has been initialized
+    return defaultStreams_.count(device) != 0;
+  };
+
+  /// Adjust the default temporary memory allocation based on the total GPU
+  /// memory size
+  static size_t getDefaultTempMemForGPU(int device, size_t requested) {
+    auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem
+                                 : std::numeric_limits<size_t>::max();
+
+    if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) {
+      // If the GPU has <= 4 GiB of memory, reserve 512 MiB
+
+      if (requested > k4GiBTempMem) {
+        return k4GiBTempMem;
+      }
+    } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) {
+      // If the GPU has <= 8 GiB of memory, reserve 1 GiB
+
+      if (requested > k8GiBTempMem) {
+        return k8GiBTempMem;
+      }
+    } else {
+      // Never use more than 1.5 GiB
+      if (requested > kMaxTempMem) {
+        return kMaxTempMem;
+      }
+    }
+
+    // use whatever lower limit the user requested
+    return requested;
+  };
+
+ private:
+  /// Set of currently outstanding memory allocations per device
+  /// device -> (allocated ptr -> alloc request)
+  std::unordered_map<int, std::unordered_map<void*, AllocRequest>> allocs_;
+
+  /// Temporary memory provider, per each device
+  std::unordered_map<int, std::unique_ptr<StackDeviceMemory>> tempMemory_;
+
+  /// Our default stream that work is ordered on, one per each device
+  std::unordered_map<int, cudaStream_t> defaultStreams_;
+
+  /// This contains particular streams as set by the user for
+  /// ordering, if any
+  std::unordered_map<int, cudaStream_t> userDefaultStreams_;
+
+  /// Other streams we can use, per each device
+  std::unordered_map<int, std::vector<cudaStream_t>> alternateStreams_;
+
+  /// Async copy stream to use for GPU <-> CPU pinned memory copies
+  std::unordered_map<int, cudaStream_t> asyncCopyStreams_;
+
+  /// cuBLAS handle for each device
+  std::unordered_map<int, cublasHandle_t> blasHandles_;
+
+  /// Pinned memory allocation for use with this GPU
+  void* pinnedMemAlloc_;
+  size_t pinnedMemAllocSize_;
+
+  /// Another option is to use a specified amount of memory on all
+  /// devices
+  size_t tempMemSize_;
+
+  /// Amount of pinned memory we should allocate
+  size_t pinnedMemSize_;
+
+  /// Whether or not we log every GPU memory allocation and deallocation
+  bool allocLogging_;
+
+  // cuda_memory_resource
+  std::unique_ptr<rmm::mr::cuda_memory_resource> cmr;
+
+  // managed_memory_resource
+  std::unique_ptr<rmm::mr::managed_memory_resource> mmr;
+
+  // pinned_memory_resource
+  std::unique_ptr<rmm::mr::pinned_memory_resource> pmr;
+};
+
+/// Default implementation of GpuResources that allocates a cuBLAS handle and
+/// 2 streams for use, as well as temporary memory.
+/// Internally, the Faiss GPU code uses the instance managed by getResources,
+/// but this is the user-facing object that is internally reference counted.
+class RmmGpuResources : public GpuResourcesProvider {
+ public:
+  RmmGpuResources() : res_(new RmmGpuResourcesImpl){};
+
+  ~RmmGpuResources(){};
+
+  std::shared_ptr<GpuResources> getResources() { return res_; };
+
+  /// Disable allocation of temporary memory; all temporary memory
+  /// requests will call cudaMalloc / cudaFree at the point of use
+  void noTempMemory() { res_->noTempMemory(); };
+
+  /// Specify that we wish to use a certain fixed size of memory on
+  /// all devices as temporary memory. This is the upper bound for the GPU
+  /// memory that we will reserve. We will never go above 1.5 GiB on any GPU;
+  /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that.
+  /// To avoid any temporary memory allocation, pass 0.
+  void setTempMemory(size_t size) { res_->setTempMemory(size); };
+
+  /// Set amount of pinned memory to allocate, for async GPU <-> CPU
+  /// transfers
+  void setPinnedMemory(size_t size) { res_->setPinnedMemory(size); };
+
+  /// Called to change the stream for work ordering. We do not own `stream`;
+  /// i.e., it will not be destroyed when the GpuResources object gets cleaned
+  /// up.
+  /// We are guaranteed that all Faiss GPU work is ordered with respect to
+  /// this stream upon exit from an index or other Faiss GPU call.
+  void setDefaultStream(int device, cudaStream_t stream) {
+    res_->setDefaultStream(device, stream);
+  };
+
+  /// Revert the default stream to the original stream managed by this resources
+  /// object, in case someone called `setDefaultStream`.
+  void revertDefaultStream(int device) { res_->revertDefaultStream(device); };
+
+  /// Called to change the work ordering streams to the null stream
+  /// for all devices
+  void setDefaultNullStreamAllDevices() {
+    res_->setDefaultNullStreamAllDevices();
+  };
+
+  /// Export a description of memory used for Python
+  std::map<int, std::map<std::string, std::pair<int, size_t>>> getMemoryInfo()
+    const {
+    return res_->getMemoryInfo();
+  };
+
+  /// Returns the current default stream
+  cudaStream_t getDefaultStream(int device) {
+    return res_->getDefaultStream(device);
+  };
+
+  /// Returns the current amount of temp memory available
+  size_t getTempMemoryAvailable(int device) const {
+    return res_->getTempMemoryAvailable(device);
+  };
+
+  /// Synchronize our default stream with the CPU
+  void syncDefaultStreamCurrentDevice() {
+    res_->syncDefaultStreamCurrentDevice();
+  };
+
+  /// If enabled, will print every GPU memory allocation and deallocation to
+  /// standard output
+  void setLogMemoryAllocations(bool enable) {
+    res_->setLogMemoryAllocations(enable);
+  };
+
+ private:
+  std::shared_ptr<RmmGpuResourcesImpl> res_;
+};
+
+} // namespace mr
+} // namespace raft
diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp
index 2cdf9bf4f5..de7ff95da7 100644
--- a/cpp/include/raft/spatial/knn/ann.hpp
+++ b/cpp/include/raft/spatial/knn/ann.hpp
@@ -20,7 +20,7 @@
 #include "detail/ann_quantized_faiss.cuh"
 
 #include
-#include <faiss/gpu/StandardGpuResources.h>
+#include <raft/mr/faiss_mr.hpp>
 
 #include
diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index 6a6c7751c2..e6ff4f7869 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -19,7 +19,7 @@
 
 #include
 #include
-#include <faiss/gpu/StandardGpuResources.h>
+#include <raft/mr/faiss_mr.hpp>
 
 namespace raft {
@@ -30,7 +30,7 @@ struct knnIndex {
   raft::distance::DistanceType metric;
   float metricArg;
 
-  faiss::gpu::StandardGpuResources *gpu_res;
+  raft::mr::RmmGpuResources *gpu_res;
   int device;
   ~knnIndex() {
     delete index;
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
index 0e91b5225d..efb7ea14b7 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
@@ -28,6 +28,7 @@
 #include
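For reviewers, a minimal sketch of how the new object is meant to be used as
a drop-in replacement for faiss::gpu::StandardGpuResources. The GpuIndexFlatL2
index, the 128-dimension / 1000-vector sizes, and the 512 MiB temp-memory cap
are illustrative assumptions, not part of this patch:

    // Illustrative sketch only -- assumes FAISS's GpuIndexFlatL2 API.
    #include <raft/mr/faiss_mr.hpp>

    #include <faiss/gpu/GpuIndexFlat.h>

    #include <vector>

    int main() {
      // Replaces faiss::gpu::StandardGpuResources: device, unified, and
      // pinned allocations are now served by RMM memory resources.
      raft::mr::RmmGpuResources res;
      res.setTempMemory((size_t)512 * 1024 * 1024);  // cap temp memory at 512 MiB

      faiss::gpu::GpuIndexFlatConfig config;
      config.device = 0;

      // Every allocation the index makes routes through RmmGpuResourcesImpl
      int dim = 128;
      faiss::gpu::GpuIndexFlatL2 index(&res, dim, config);

      // Add 1000 vectors, then query the 5 nearest neighbors of the first one
      std::vector<float> vecs(1000 * dim, 0.5f);
      index.add(1000, vecs.data());

      int k = 5;
      std::vector<float> distances(k);
      std::vector<faiss::Index::idx_t> labels(k);
      index.search(1, vecs.data(), k, distances.data(), labels.data());

      return 0;
    }

Because RmmGpuResources implements the GpuResourcesProvider interface, any
FAISS GPU index that accepts a provider can be constructed against it this
way; only the construction site changes, as in the knnIndex hunk above.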