diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py
index 0581055647..407f40f818 100644
--- a/ci/checks/copyright.py
+++ b/ci/checks/copyright.py
@@ -36,6 +36,7 @@
 ]
 FILES_TO_EXCLUDE = [
     re.compile(r"cpp/src/tsne/cannylab/bh\.cu"),
+    re.compile(r"cpp/src/hdbscan/detail/faiss_mr\.hpp"),
 ]
 
 # this will break starting at year 10000, which is probably OK :)
diff --git a/conda/recipes/libcuml/meta.yaml b/conda/recipes/libcuml/meta.yaml
index cc4d598fb7..a6c43f9646 100644
--- a/conda/recipes/libcuml/meta.yaml
+++ b/conda/recipes/libcuml/meta.yaml
@@ -59,6 +59,8 @@ requirements:
     - libraft-headers ={{ minor_version }}
     - libraft-nn ={{ minor_version }}
     - treelite {{ treelite_version }}
+    - libfaiss>=1.7.1
+    - faiss-proc=*=cuda
 
 outputs:
   - name: libcuml
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5025b05fde..cc687dcedc 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -67,10 +67,13 @@ option(CUML_USE_FAISS_STATIC "Build and statically link the FAISS library for nearest neighbors search on GPU" OFF)
 option(CUML_USE_TREELITE_STATIC "Build and statically link the treelite library" OFF)
 option(CUML_EXPORT_TREELITE_LINKAGE "Whether to publicly or privately link treelite to libcuml++" OFF)
 option(CUML_USE_CUMLPRIMS_MG_STATIC "Build and statically link the cumlprims_mg library" OFF)
+option(CUML_ENABLE_NN_DEPENDENCIES "Whether to enable the FAISS dependency" ON)
+
 # The options below allow incorporating libcuml into another build process
 # without installing all its components. This is useful if total file size is
 # at a premium and we do not expect other consumers to use any APIs of the
 # dependency except those that are directly linked to by the dependent library.
+option(CUML_EXCLUDE_FAISS_FROM_ALL "Exclude FAISS targets from cuML's 'all' target" ON)
 option(CUML_EXCLUDE_RAFT_FROM_ALL "Exclude RAFT targets from cuML's 'all' target" OFF)
 option(CUML_EXCLUDE_TREELITE_FROM_ALL "Exclude Treelite targets from cuML's 'all' target" OFF)
 option(CUML_EXCLUDE_CUMLPRIMS_MG_FROM_ALL "Exclude cumlprims_mg targets from cuML's 'all' target" OFF)
@@ -220,6 +223,10 @@ endif()
 
 include(cmake/thirdparty/get_raft.cmake)
 
+if(CUML_USE_RAFT_NN)
+  include(cmake/thirdparty/get_faiss.cmake)
+endif()
+
 if(LINK_TREELITE)
   include(cmake/thirdparty/get_treelite.cmake)
 endif()
@@ -556,10 +563,6 @@ if(BUILD_CUML_CPP_LIBRARY)
     if(CUML_USE_RAFT_STATIC AND (TARGET raft::raft))
       copy_interface_excludes(INCLUDED_TARGET raft::raft TARGET ${CUML_CPP_TARGET})
 
-      if(CUML_USE_RAFT_NN AND (TARGET faiss::faiss))
-        copy_interface_excludes(INCLUDED_TARGET faiss::faiss TARGET ${CUML_CPP_TARGET})
-      endif()
-
       if(CUML_USE_RAFT_DIST AND (TARGET cuco::cuco))
         list(APPEND _cuml_cpp_private_libs cuco::cuco)
       endif()
@@ -602,6 +605,7 @@ if(BUILD_CUML_CPP_LIBRARY)
   #   INTERFACE target.
   list(APPEND ${_cuml_cpp_libs_var_name}
     raft::raft
+    $<$<BOOL:${CUML_USE_RAFT_NN}>:faiss>
     $<$<BOOL:${CUML_USE_RAFT_NN}>:raft::nn>
     $<$<BOOL:${CUML_USE_RAFT_DIST}>:raft::distance>
     $
diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt
index 7fbb3da70b..870d0b0120 100644
--- a/cpp/bench/CMakeLists.txt
+++ b/cpp/bench/CMakeLists.txt
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -47,6 +47,7 @@ if(BUILD_CUML_BENCH)
     ${TREELITE_LIBS}
     raft::raft
     raft::nn
+    faiss::faiss
    raft::distance
   )
diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake
new file mode 100644
index 0000000000..c4ef84f500
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_faiss.cmake
@@ -0,0 +1,89 @@
+#=============================================================================
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+function(find_and_configure_faiss)
+    set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL)
+    cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN} )
+
+    if(CUML_USE_RAFT_NN)
+        rapids_find_generate_module(faiss
+            HEADER_NAMES  faiss/IndexFlat.h
+            LIBRARY_NAMES faiss
+        )
+
+        set(BUILD_SHARED_LIBS ON)
+        if(PKG_BUILD_STATIC_LIBS)
+            set(BUILD_SHARED_LIBS OFF)
+            set(CPM_DOWNLOAD_faiss ON)
+        endif()
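+
+        # Annotation (not in the original patch): BUILD_SHARED_LIBS defaults to
+        # shared here; requesting a static FAISS flips it off and forces a CPM
+        # source download so the archive is compiled locally rather than picked
+        # up from an existing installation.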
+ include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] cuml-exports) +endfunction() + +if(NOT CUML_FAISS_GIT_TAG) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(CUML_FAISS_GIT_TAG fea/statically-link-ctk-v1.7.0) + # set(RAFT_FAISS_GIT_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30) +endif() + +if(NOT CUML_FAISS_GIT_REPOSITORY) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(CUML_FAISS_GIT_REPOSITORY https://github.com/trxcllnt/faiss.git) + # set(RAFT_FAISS_GIT_REPOSITORY https://github.com/facebookresearch/faiss.git) +endif() + +find_and_configure_faiss(VERSION 1.7.0 + REPOSITORY ${CUML_FAISS_GIT_REPOSITORY} + PINNED_TAG ${CUML_FAISS_GIT_TAG} + BUILD_STATIC_LIBS ${CUML_USE_FAISS_STATIC} + EXCLUDE_FROM_ALL ${CUML_EXCLUDE_FAISS_FROM_ALL}) \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake index a890a5e72e..264d4fae15 100644 --- a/cpp/cmake/thirdparty/get_raft.cmake +++ b/cpp/cmake/thirdparty/get_raft.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,30 +23,30 @@ function(find_and_configure_raft) "${multiValueArgs}" ${ARGN} ) if(PKG_CLONE_ON_PIN AND NOT PKG_PINNED_TAG STREQUAL "branch-${CUML_BRANCH_VERSION_raft}") - message(STATUS "CUML: RAFT pinned tag found: ${PKG_PINNED_TAG}. Cloning raft locally.") - set(CPM_DOWNLOAD_raft ON) + message(STATUS "CUML: RAFT pinned tag found: ${PKG_PINNED_TAG}. 
+
+find_and_configure_faiss(VERSION           1.7.0
+                         REPOSITORY        ${CUML_FAISS_GIT_REPOSITORY}
+                         PINNED_TAG        ${CUML_FAISS_GIT_TAG}
+                         BUILD_STATIC_LIBS ${CUML_USE_FAISS_STATIC}
+                         EXCLUDE_FROM_ALL  ${CUML_EXCLUDE_FAISS_FROM_ALL})
\ No newline at end of file
diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index a890a5e72e..264d4fae15 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,30 +23,30 @@ function(find_and_configure_raft)
                           "${multiValueArgs}" ${ARGN} )
 
     if(PKG_CLONE_ON_PIN AND NOT PKG_PINNED_TAG STREQUAL "branch-${CUML_BRANCH_VERSION_raft}")
-      message(STATUS "CUML: RAFT pinned tag found: ${PKG_PINNED_TAG}. Cloning raft locally.")
-      set(CPM_DOWNLOAD_raft ON)
+        message(STATUS "CUML: RAFT pinned tag found: ${PKG_PINNED_TAG}. Cloning raft locally.")
+        set(CPM_DOWNLOAD_raft ON)
     elseif(PKG_USE_RAFT_STATIC AND (NOT CPM_raft_SOURCE))
-      message(STATUS "CUML: Cloning raft locally to build static libraries.")
-      set(CPM_DOWNLOAD_raft ON)
+        message(STATUS "CUML: Cloning raft locally to build static libraries.")
+        set(CPM_DOWNLOAD_raft ON)
     endif()
 
     if(PKG_USE_RAFT_DIST)
-      string(APPEND RAFT_COMPONENTS "distance")
+        string(APPEND RAFT_COMPONENTS "distance")
     endif()
 
     if(PKG_USE_RAFT_NN)
-      string(APPEND RAFT_COMPONENTS " nn")
+        string(APPEND RAFT_COMPONENTS " nn")
     endif()
 
     # We need RAFT::distributed for MG tests
     if(BUILD_CUML_MG_TESTS)
-      string(APPEND RAFT_COMPONENTS " distributed")
+        string(APPEND RAFT_COMPONENTS " distributed")
    endif()
 
     if(PKG_USE_RAFT_DIST AND PKG_USE_RAFT_NN)
-      set(RAFT_COMPILE_LIBRARIES ON)
+        set(RAFT_COMPILE_LIBRARIES ON)
     else()
-      set(RAFT_COMPILE_LIBRARIES OFF)
+        set(RAFT_COMPILE_LIBRARIES OFF)
     endif()
 
     # We need to set this each time so that on subsequent calls to cmake
@@ -55,29 +55,29 @@ function(find_and_configure_raft)
     set(RAFT_BUILD_SHARED_LIBS ON)
     if(${PKG_USE_RAFT_STATIC})
-      set(RAFT_BUILD_SHARED_LIBS OFF)
+        set(RAFT_BUILD_SHARED_LIBS OFF)
     endif()
 
     message(VERBOSE "CUML: raft FIND_PACKAGE_ARGUMENTS COMPONENTS ${RAFT_COMPONENTS}")
 
     rapids_cpm_find(raft ${PKG_VERSION}
-      GLOBAL_TARGETS     raft::raft
-      BUILD_EXPORT_SET   cuml-exports
-      INSTALL_EXPORT_SET cuml-exports
-      COMPONENTS         ${RAFT_COMPONENTS}
-      CPM_ARGS
-        GIT_REPOSITORY   https://github.com/${PKG_FORK}/raft.git
-        GIT_TAG          ${PKG_PINNED_TAG}
-        SOURCE_SUBDIR    cpp
-        EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}
-        OPTIONS
-          "BUILD_TESTS OFF"
-          "BUILD_SHARED_LIBS ${RAFT_BUILD_SHARED_LIBS}"
-          "RAFT_COMPILE_LIBRARIES ${RAFT_COMPILE_LIBRARIES}"
-          "RAFT_COMPILE_NN_LIBRARY ${PKG_USE_RAFT_NN}"
-          "RAFT_COMPILE_DIST_LIBRARY ${PKG_USE_RAFT_DIST}"
-          "RAFT_USE_FAISS_STATIC ${PKG_USE_FAISS_STATIC}"
-    )
+        GLOBAL_TARGETS     raft::raft
+        BUILD_EXPORT_SET   cuml-exports
+        INSTALL_EXPORT_SET cuml-exports
+        COMPONENTS         ${RAFT_COMPONENTS}
+        CPM_ARGS
+          GIT_REPOSITORY   https://github.com/${PKG_FORK}/raft.git
+          GIT_TAG          ${PKG_PINNED_TAG}
+          SOURCE_SUBDIR    cpp
+          EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}
+          OPTIONS
+            "BUILD_TESTS OFF"
+            "BUILD_SHARED_LIBS ${RAFT_BUILD_SHARED_LIBS}"
+            "RAFT_COMPILE_LIBRARIES ${RAFT_COMPILE_LIBRARIES}"
+            "RAFT_COMPILE_NN_LIBRARY ${PKG_USE_RAFT_NN}"
+            "RAFT_COMPILE_DIST_LIBRARY ${PKG_USE_RAFT_DIST}"
+            "RAFT_USE_FAISS_STATIC ${PKG_USE_FAISS_STATIC}"
+    )
 
     if(raft_ADDED)
         message(VERBOSE "CUML: Using RAFT located in ${raft_SOURCE_DIR}")
@@ -92,16 +92,16 @@ endfunction()
 # To use a different RAFT locally, set the CMake variable
 # CPM_raft_SOURCE=/path/to/local/raft
 find_and_configure_raft(VERSION    ${CUML_MIN_VERSION_raft}
-    FORK             rapidsai
-    PINNED_TAG       branch-${CUML_BRANCH_VERSION_raft}
-    EXCLUDE_FROM_ALL ${CUML_EXCLUDE_RAFT_FROM_ALL}
-    # When PINNED_TAG above doesn't match cuml,
-    # force local raft clone in build directory
-    # even if it's already installed.
-    CLONE_ON_PIN     ${CUML_RAFT_CLONE_ON_PIN}
-    USE_RAFT_NN      ${CUML_USE_RAFT_NN}
-    USE_RAFT_DIST    ${CUML_USE_RAFT_DIST}
-    USE_RAFT_STATIC  ${CUML_USE_RAFT_STATIC}
-    USE_FAISS_STATIC ${CUML_USE_FAISS_STATIC}
-    NVTX             ${NVTX}
-    )
+                        FORK       rapidsai
+                        PINNED_TAG branch-${CUML_BRANCH_VERSION_raft}
+                        EXCLUDE_FROM_ALL ${CUML_EXCLUDE_RAFT_FROM_ALL}
+                        # When PINNED_TAG above doesn't match cuml,
+                        # force local raft clone in build directory
+                        # even if it's already installed.
+                        CLONE_ON_PIN     ${CUML_RAFT_CLONE_ON_PIN}
+                        USE_RAFT_NN      ${CUML_USE_RAFT_NN}
+                        USE_RAFT_DIST    ${CUML_USE_RAFT_DIST}
+                        USE_RAFT_STATIC  ${CUML_USE_RAFT_STATIC}
+                        USE_FAISS_STATIC ${CUML_USE_FAISS_STATIC}
+                        NVTX             ${NVTX}
+                        )
\ No newline at end of file
diff --git a/cpp/src/hdbscan/detail/faiss_mr.hpp b/cpp/src/hdbscan/detail/faiss_mr.hpp
new file mode 100644
index 0000000000..6aeaf35b94
--- /dev/null
+++ b/cpp/src/hdbscan/detail/faiss_mr.hpp
@@ -0,0 +1,642 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+This file contains a large amount of code duplication. Most of the logic
+below mirrors FAISS's standard GpuResources implementation and should be
+inherited from it rather than copied; once the FAISS issue below is
+resolved, the duplicated code can be deleted:
+https://github.com/facebookresearch/faiss/issues/2097
+*/
+
+#pragma once
+
+#include <faiss/gpu/GpuResources.h>
+#include <faiss/gpu/utils/DeviceUtils.h>
+#include <faiss/gpu/utils/StackDeviceMemory.h>
+#include <faiss/gpu/utils/StaticUtils.h>
+#include <faiss/impl/FaissAssert.h>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <unordered_map>
+#include <vector>
+
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/host/pinned_memory_resource.hpp>
+
+namespace ML {
+namespace HDBSCAN {
+namespace detail {
+namespace Reachability {
+
+using namespace faiss::gpu;
+
+namespace {
+
+// How many streams per device we allocate by default (for multi-streaming)
+constexpr int kNumStreams = 2;
+
+// Use 256 MiB of pinned memory for async CPU <-> GPU copies by default
+constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024;
+
+// Default temporary memory allocation for <= 4 GiB memory GPUs
+constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024;
+
+// Default temporary memory allocation for <= 8 GiB memory GPUs
+constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024;
+
+// Maximum temporary memory allocation for all GPUs
+constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024;
+
+std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map)
+{
+  // Produce a sorted list of all outstanding allocations by type
+  std::unordered_map<AllocType, std::pair<int, size_t>> stats;
+
+  for (auto& entry : map) {
+    auto& a = entry.second;
+
+    auto it = stats.find(a.type);
+    if (it != stats.end()) {
+      stats[a.type].first++;
+      stats[a.type].second += a.size;
+    } else {
+      stats[a.type] = std::make_pair(1, a.size);
+    }
+  }
+
+  std::stringstream ss;
+  for (auto& entry : stats) {
+    ss << "Alloc type " << allocTypeToString(entry.first) << ": " << entry.second.first
+       << " allocations, " << entry.second.second << " bytes\n";
+  }
+
+  return ss.str();
+}
+
+}  // namespace
+
+/// RMM implementation of the GpuResources object that provides for a
+/// temporary memory manager
+class RmmGpuResourcesImpl : public GpuResources {
+ public:
+  RmmGpuResourcesImpl()
+    : pinnedMemAlloc_(nullptr),
+      pinnedMemAllocSize_(0),
+      // let the adjustment function determine the memory size for us by passing
+      // in a huge value that will then be adjusted
+      tempMemSize_(getDefaultTempMemForGPU(-1, std::numeric_limits<size_t>::max())),
+      pinnedMemSize_(kDefaultPinnedMemoryAllocation),
+      allocLogging_(false),
+      cmr(new rmm::mr::cuda_memory_resource),
+      mmr(new rmm::mr::managed_memory_resource),
+      pmr(new rmm::mr::pinned_memory_resource){};
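+
+  // Annotation (not in the upstream file): the three RMM resources initialized
+  // above back the FAISS memory spaces used by allocMemory() below --
+  // MemorySpace::Device goes through cmr, MemorySpace::Unified through mmr,
+  // and the pinned host staging buffer through pmr.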
+
+  ~RmmGpuResourcesImpl()
+  {
+    // The temporary memory allocator has allocated memory through us, so clean
+    // that up before we finish fully de-initializing ourselves
+    tempMemory_.clear();
+
+    // Make sure all allocations have been freed
+    bool allocError = false;
+
+    for (auto& entry : allocs_) {
+      auto& map = entry.second;
+
+      if (!map.empty()) {
+        std::cerr << "RmmGpuResources destroyed with allocations outstanding:\n"
+                  << "Device " << entry.first << " outstanding allocations:\n";
+        std::cerr << allocsToString(map);
+        allocError = true;
+      }
+    }
+
+    FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up");
+
+    for (auto& entry : defaultStreams_) {
+      DeviceScope scope(entry.first);
+
+      // We created these streams, so are responsible for destroying them
+      CUDA_VERIFY(cudaStreamDestroy(entry.second));
+    }
+
+    for (auto& entry : alternateStreams_) {
+      DeviceScope scope(entry.first);
+
+      for (auto stream : entry.second) {
+        CUDA_VERIFY(cudaStreamDestroy(stream));
+      }
+    }
+
+    for (auto& entry : asyncCopyStreams_) {
+      DeviceScope scope(entry.first);
+
+      CUDA_VERIFY(cudaStreamDestroy(entry.second));
+    }
+
+    for (auto& entry : blasHandles_) {
+      DeviceScope scope(entry.first);
+
+      auto blasStatus = cublasDestroy(entry.second);
+      FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+    }
+
+    if (pinnedMemAlloc_) { pmr->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_); }
+  };
+
+  /// Disable allocation of temporary memory; all temporary memory
+  /// requests will call cudaMalloc / cudaFree at the point of use
+  void noTempMemory() { setTempMemory(0); };
+
+  /// Specify that we wish to use a certain fixed size of memory on
+  /// all devices as temporary memory. This is the upper bound for the GPU
+  /// memory that we will reserve. We will never go above 1.5 GiB on any GPU;
+  /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that.
+  /// To avoid any temporary memory allocation, pass 0.
+  void setTempMemory(size_t size)
+  {
+    if (tempMemSize_ != size) {
+      // adjust based on general limits
+      tempMemSize_ = getDefaultTempMemForGPU(-1, size);
+
+      // We need to re-initialize memory resources for all current devices that
+      // have been initialized. This should be safe to do, even if we are
+      // currently running work, because the cudaFree call that this implies
+      // will force-synchronize all GPUs with the CPU.
+      for (auto& p : tempMemory_) {
+        int device = p.first;
+        // Free the existing memory first
+        p.second.reset();
+
+        // Allocate new
+        p.second = std::unique_ptr<StackDeviceMemory>(
+          new StackDeviceMemory(this,
+                                p.first,
+                                // adjust for this specific device
+                                getDefaultTempMemForGPU(device, tempMemSize_)));
+      }
+    }
+  };
+
+  /// Set amount of pinned memory to allocate, for async GPU <-> CPU
+  /// transfers
+  void setPinnedMemory(size_t size)
+  {
+    // Should not call this after devices have been initialized
+    FAISS_ASSERT(defaultStreams_.size() == 0);
+    FAISS_ASSERT(!pinnedMemAlloc_);
+
+    pinnedMemSize_ = size;
+  };
+
+  /// Called to change the stream for work ordering. We do not own `stream`;
+  /// i.e., it will not be destroyed when the GpuResources object gets cleaned
+  /// up.
+  /// We are guaranteed that all Faiss GPU work is ordered with respect to
+  /// this stream upon exit from an index or other Faiss GPU call.
+  void setDefaultStream(int device, cudaStream_t stream)
+  {
+    if (isInitialized(device)) {
+      // A new series of calls may not be ordered with what was the previous
+      // stream, so if the stream being specified is different, then we need to
+      // ensure ordering between the two (new stream waits on old).
+      auto it                 = userDefaultStreams_.find(device);
+      cudaStream_t prevStream = nullptr;
+
+      if (it != userDefaultStreams_.end()) {
+        prevStream = it->second;
+      } else {
+        FAISS_ASSERT(defaultStreams_.count(device));
+        prevStream = defaultStreams_[device];
+      }
+
+      if (prevStream != stream) { streamWait({stream}, {prevStream}); }
+    }
+
+    userDefaultStreams_[device] = stream;
+  };
+
+  /// Revert the default stream to the original stream managed by this resources
+  /// object, in case someone called `setDefaultStream`.
+  void revertDefaultStream(int device)
+  {
+    if (isInitialized(device)) {
+      auto it = userDefaultStreams_.find(device);
+
+      if (it != userDefaultStreams_.end()) {
+        // There was a user stream set that we need to synchronize against
+        cudaStream_t prevStream = userDefaultStreams_[device];
+
+        FAISS_ASSERT(defaultStreams_.count(device));
+        cudaStream_t newStream = defaultStreams_[device];
+
+        streamWait({newStream}, {prevStream});
+      }
+    }
+
+    userDefaultStreams_.erase(device);
+  };
+
+  /// Returns the stream for the given device on which all Faiss GPU work is
+  /// ordered.
+  /// We are guaranteed that all Faiss GPU work is ordered with respect to
+  /// this stream upon exit from an index or other Faiss GPU call.
+  cudaStream_t getDefaultStream(int device)
+  {
+    initializeForDevice(device);
+
+    auto it = userDefaultStreams_.find(device);
+    if (it != userDefaultStreams_.end()) {
+      // There is a user override stream set
+      return it->second;
+    }
+
+    // Otherwise, our base default stream
+    return defaultStreams_[device];
+  };
+
+  /// Called to change the work ordering streams to the null stream
+  /// for all devices
+  void setDefaultNullStreamAllDevices()
+  {
+    for (int dev = 0; dev < getNumDevices(); ++dev) {
+      setDefaultStream(dev, nullptr);
+    }
+  };
+
+  /// If enabled, will print every GPU memory allocation and deallocation to
+  /// standard output
+  void setLogMemoryAllocations(bool enable) { allocLogging_ = enable; };
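+
+  // Annotation (not in the upstream file): the stream-ordering logic above is
+  // unchanged relative to FAISS's StandardGpuResources; this copy only swaps
+  // the allocator (see allocMemory / deallocMemory below).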
+
+ public:
+  /// Internal system calls
+
+  /// Initialize resources for this device
+  void initializeForDevice(int device)
+  {
+    if (isInitialized(device)) { return; }
+
+    // If this is the first device that we're initializing, create our
+    // pinned memory allocation
+    if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
+      pinnedMemAlloc_     = pmr->allocate(pinnedMemSize_);
+      pinnedMemAllocSize_ = pinnedMemSize_;
+    }
+
+    FAISS_ASSERT(device < getNumDevices());
+    DeviceScope scope(device);
+
+    // Make sure that device properties for all devices are cached
+    auto& prop = getDeviceProperties(device);
+
+    // Also check to make sure we meet our minimum compute capability (3.0)
+    FAISS_ASSERT_FMT(prop.major >= 3,
+                     "Device id %d with CC %d.%d not supported, "
+                     "need 3.0+ compute capability",
+                     device,
+                     prop.major,
+                     prop.minor);
+
+    // Create streams
+    cudaStream_t defaultStream = 0;
+    CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));
+
+    defaultStreams_[device] = defaultStream;
+
+    cudaStream_t asyncCopyStream = 0;
+    CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));
+
+    asyncCopyStreams_[device] = asyncCopyStream;
+
+    std::vector<cudaStream_t> deviceStreams;
+    for (int j = 0; j < kNumStreams; ++j) {
+      cudaStream_t stream = 0;
+      CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+
+      deviceStreams.push_back(stream);
+    }
+
+    alternateStreams_[device] = std::move(deviceStreams);
+
+    // Create cuBLAS handle
+    cublasHandle_t blasHandle = 0;
+    auto blasStatus           = cublasCreate(&blasHandle);
+    FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+    blasHandles_[device] = blasHandle;
+
+    // For CUDA 10 on V100, enabling tensor core usage would enable automatic
+    // rounding down of inputs to f16 (though accumulate in f32) which results in
+    // unacceptable loss of precision in general.
+    // For CUDA 11 / A100, only enable tensor core support if it doesn't result in
+    // a loss of precision.
+#if CUDA_VERSION >= 11000
+    cublasSetMathMode(blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+#endif
+
+    FAISS_ASSERT(allocs_.count(device) == 0);
+    allocs_[device] = std::unordered_map<void*, AllocRequest>();
+
+    FAISS_ASSERT(tempMemory_.count(device) == 0);
+    auto mem = std::unique_ptr<StackDeviceMemory>(
+      new StackDeviceMemory(this,
+                            device,
+                            // adjust for this specific device
+                            getDefaultTempMemForGPU(device, tempMemSize_)));
+
+    tempMemory_.emplace(device, std::move(mem));
+  };
+
+  cublasHandle_t getBlasHandle(int device)
+  {
+    initializeForDevice(device);
+    return blasHandles_[device];
+  };
+
+  std::vector<cudaStream_t> getAlternateStreams(int device)
+  {
+    initializeForDevice(device);
+    return alternateStreams_[device];
+  };
+
+  /// Allocate non-temporary GPU memory
+  void* allocMemory(const AllocRequest& req)
+  {
+    initializeForDevice(req.device);
+
+    // We don't allocate a placeholder for zero-sized allocations
+    if (req.size == 0) { return nullptr; }
+
+    // Make sure that the allocation is a multiple of 16 bytes for alignment
+    // purposes
+    auto adjReq = req;
+    adjReq.size = utils::roundUp(adjReq.size, (size_t)16);
+
+    void* p = nullptr;
+
+    if (allocLogging_) { std::cout << "RmmGpuResources: alloc " << adjReq.toString() << "\n"; }
+
+    if (adjReq.space == MemorySpace::Temporary) {
+      // If we don't have enough space in our temporary memory manager, we need
+      // to allocate this request separately
+      auto& tempMem = tempMemory_[adjReq.device];
+
+      if (adjReq.size > tempMem->getSizeAvailable()) {
+        // We need to allocate this ourselves
+        AllocRequest newReq = adjReq;
+        newReq.space        = MemorySpace::Device;
+        newReq.type         = AllocType::TemporaryMemoryOverflow;
+
+        return allocMemory(newReq);
+      }
+
+      // Otherwise, we can handle this locally
+      p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
+
+    } else if (adjReq.space == MemorySpace::Device) {
+      p = cmr->allocate(adjReq.size, adjReq.stream);
+    } else if (adjReq.space == MemorySpace::Unified) {
+      p = mmr->allocate(adjReq.size, adjReq.stream);
+    } else {
+      FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
+    }
+
+    allocs_[adjReq.device][p] = adjReq;
+
+    return p;
+  };
+
+  /// Returns a previous allocation
+  void deallocMemory(int device, void* p)
+  {
+    FAISS_ASSERT(isInitialized(device));
+
+    if (!p) { return; }
+
+    auto& a = allocs_[device];
+    auto it = a.find(p);
+    FAISS_ASSERT(it != a.end());
+
+    auto& req = it->second;
+
+    if (allocLogging_) { std::cout << "RmmGpuResources: dealloc " << req.toString() << "\n"; }
+
+    if (req.space == MemorySpace::Temporary) {
+      tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
+    } else if (req.space == MemorySpace::Device) {
+      cmr->deallocate(p, req.size, req.stream);
+    } else if (req.space == MemorySpace::Unified) {
+      mmr->deallocate(p, req.size, req.stream);
+    } else {
+      FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
+    }
+
+    a.erase(it);
+  };
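+
+  // Annotation (not in the upstream file): allocMemory / deallocMemory above
+  // are the point of this copy -- where StandardGpuResources would call
+  // cudaMalloc / cudaFree, Device and Unified requests are dispatched to the
+  // stream-ordered RMM resources (cmr, mmr) instead.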
+
+  size_t getTempMemoryAvailable(int device) const
+  {
+    FAISS_ASSERT(isInitialized(device));
+
+    auto it = tempMemory_.find(device);
+    FAISS_ASSERT(it != tempMemory_.end());
+
+    return it->second->getSizeAvailable();
+  };
+
+  /// Export a description of memory used for Python
+  std::map<int, std::map<std::string, std::pair<int, size_t>>> getMemoryInfo() const
+  {
+    using AT = std::map<std::string, std::pair<int, size_t>>;
+
+    std::map<int, AT> out;
+
+    for (auto& entry : allocs_) {
+      AT outDevice;
+
+      for (auto& a : entry.second) {
+        auto& v = outDevice[allocTypeToString(a.second.type)];
+        v.first++;
+        v.second += a.second.size;
+      }
+
+      out[entry.first] = std::move(outDevice);
+    }
+
+    return out;
+  };
+
+  std::pair<void*, size_t> getPinnedMemory()
+  {
+    return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
+  };
+
+  cudaStream_t getAsyncCopyStream(int device)
+  {
+    initializeForDevice(device);
+    return asyncCopyStreams_[device];
+  };
+
+ private:
+  /// Have GPU resources been initialized for this device yet?
+  bool isInitialized(int device) const
+  {
+    // Use default streams as a marker for whether or not a certain
+    // device has been initialized
+    return defaultStreams_.count(device) != 0;
+  };
+
+  /// Adjust the default temporary memory allocation based on the total GPU
+  /// memory size
+  static size_t getDefaultTempMemForGPU(int device, size_t requested)
+  {
+    auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem
+                                 : std::numeric_limits<size_t>::max();
+
+    if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) {
+      // If the GPU has <= 4 GiB of memory, reserve 512 MiB
+      if (requested > k4GiBTempMem) { return k4GiBTempMem; }
+    } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) {
+      // If the GPU has <= 8 GiB of memory, reserve 1 GiB
+      if (requested > k8GiBTempMem) { return k8GiBTempMem; }
+    } else {
+      // Never use more than 1.5 GiB
+      if (requested > kMaxTempMem) { return kMaxTempMem; }
+    }
+
+    // use whatever lower limit the user requested
+    return requested;
+  };
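+
+  // Annotation (not in the upstream file): e.g. on a 16 GiB GPU the
+  // constructor's request of SIZE_MAX falls through to the final branch and is
+  // clamped to kMaxTempMem (1.5 GiB); on a 6 GiB GPU a later per-device
+  // adjustment clamps it to k8GiBTempMem (1 GiB).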
+
+ private:
+  /// Set of currently outstanding memory allocations per device
+  /// device -> (alloc request, allocated ptr)
+  std::unordered_map<int, std::unordered_map<void*, AllocRequest>> allocs_;
+
+  /// Temporary memory provider, per each device
+  std::unordered_map<int, std::unique_ptr<StackDeviceMemory>> tempMemory_;
+
+  /// Our default stream that work is ordered on, one per each device
+  std::unordered_map<int, cudaStream_t> defaultStreams_;
+
+  /// This contains particular streams as set by the user for
+  /// ordering, if any
+  std::unordered_map<int, cudaStream_t> userDefaultStreams_;
+
+  /// Other streams we can use, per each device
+  std::unordered_map<int, std::vector<cudaStream_t>> alternateStreams_;
+
+  /// Async copy stream to use for GPU <-> CPU pinned memory copies
+  std::unordered_map<int, cudaStream_t> asyncCopyStreams_;
+
+  /// cuBLAS handle for each device
+  std::unordered_map<int, cublasHandle_t> blasHandles_;
+
+  /// Pinned memory allocation for use with this GPU
+  void* pinnedMemAlloc_;
+  size_t pinnedMemAllocSize_;
+
+  /// Another option is to use a specified amount of memory on all
+  /// devices
+  size_t tempMemSize_;
+
+  /// Amount of pinned memory we should allocate
+  size_t pinnedMemSize_;
+
+  /// Whether or not we log every GPU memory allocation and deallocation
+  bool allocLogging_;
+
+  // cuda_memory_resource
+  std::unique_ptr<rmm::mr::cuda_memory_resource> cmr;
+
+  // managed_memory_resource
+  std::unique_ptr<rmm::mr::managed_memory_resource> mmr;
+
+  // pinned_memory_resource
+  std::unique_ptr<rmm::mr::pinned_memory_resource> pmr;
+};
+
+/// Default implementation of GpuResources that allocates a cuBLAS handle and
+/// 2 streams for use, as well as temporary memory.
+/// Internally, the Faiss GPU code uses the instance managed by getResources,
+/// but this is the user-facing object that is internally reference counted.
+class RmmGpuResources : public GpuResourcesProvider {
+ public:
+  RmmGpuResources() : res_(new RmmGpuResourcesImpl){};
+
+  ~RmmGpuResources(){};
+
+  std::shared_ptr<GpuResources> getResources() { return res_; };
+
+  /// Disable allocation of temporary memory; all temporary memory
+  /// requests will call cudaMalloc / cudaFree at the point of use
+  void noTempMemory() { res_->noTempMemory(); };
+
+  /// Specify that we wish to use a certain fixed size of memory on
+  /// all devices as temporary memory. This is the upper bound for the GPU
+  /// memory that we will reserve. We will never go above 1.5 GiB on any GPU;
+  /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that.
+  /// To avoid any temporary memory allocation, pass 0.
+  void setTempMemory(size_t size) { res_->setTempMemory(size); };
+
+  /// Set amount of pinned memory to allocate, for async GPU <-> CPU
+  /// transfers
+  void setPinnedMemory(size_t size) { res_->setPinnedMemory(size); };
+
+  /// Called to change the stream for work ordering. We do not own `stream`;
+  /// i.e., it will not be destroyed when the GpuResources object gets cleaned
+  /// up.
+  /// We are guaranteed that all Faiss GPU work is ordered with respect to
+  /// this stream upon exit from an index or other Faiss GPU call.
+  void setDefaultStream(int device, cudaStream_t stream)
+  {
+    res_->setDefaultStream(device, stream);
+  };
+
+  /// Revert the default stream to the original stream managed by this resources
+  /// object, in case someone called `setDefaultStream`.
+  void revertDefaultStream(int device) { res_->revertDefaultStream(device); };
+
+  /// Called to change the work ordering streams to the null stream
+  /// for all devices
+  void setDefaultNullStreamAllDevices() { res_->setDefaultNullStreamAllDevices(); };
+
+  /// Export a description of memory used for Python
+  std::map<int, std::map<std::string, std::pair<int, size_t>>> getMemoryInfo() const
+  {
+    return res_->getMemoryInfo();
+  };
+
+  /// Returns the current default stream
+  cudaStream_t getDefaultStream(int device) { return res_->getDefaultStream(device); };
+
+  /// Returns the current amount of temp memory available
+  size_t getTempMemoryAvailable(int device) const { return res_->getTempMemoryAvailable(device); };
+
+  /// Synchronize our default stream with the CPU
+  void syncDefaultStreamCurrentDevice() { res_->syncDefaultStreamCurrentDevice(); };
+
+  /// If enabled, will print every GPU memory allocation and deallocation to
+  /// standard output
+  void setLogMemoryAllocations(bool enable) { res_->setLogMemoryAllocations(enable); };
+
+ private:
+  std::shared_ptr<RmmGpuResourcesImpl> res_;
+};
+
+}  // namespace Reachability
+}  // namespace detail
+}  // namespace HDBSCAN
+}  // namespace ML
diff --git a/cpp/src/hdbscan/detail/reachability_faiss.cuh b/cpp/src/hdbscan/detail/reachability_faiss.cuh
index c2d970727b..255bbbc98a 100644
--- a/cpp/src/hdbscan/detail/reachability_faiss.cuh
+++ b/cpp/src/hdbscan/detail/reachability_faiss.cuh
@@ -6,7 +6,7 @@
  */
 
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -39,13 +39,14 @@
 #include
 #include
-#include <raft/spatial/knn/faiss_mr.hpp>
 #include
 #include
 #include
 
+#include "faiss_mr.hpp"
+
 namespace ML {
 namespace HDBSCAN {
 namespace detail {
@@ -212,7 +213,7 @@ void mutual_reachability_knn_l2(const raft::handle_t& handle,
   auto stream = handle.get_stream();
 
   faiss::gpu::DeviceScope ds(device);
-  raft::spatial::knn::RmmGpuResources res;
+  RmmGpuResources res;
   res.noTempMemory();
   res.setDefaultStream(device, stream);
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 3518dcb113..acfdc65303 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -44,6 +44,7 @@ function(ConfigureTest)
       $<$:CUDA::cufft${_ctk_static_suffix_cufft}>
       rmm::rmm
       raft::raft
+      $<$<BOOL:${CUML_USE_RAFT_NN}>:faiss::faiss>
      $<$<BOOL:${CUML_USE_RAFT_NN}>:raft::nn>
      $<$<BOOL:${CUML_USE_RAFT_DIST}>:raft::distance>
      GTest::gtest
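
Reviewer note: the vendored `RmmGpuResources` drops in wherever a `faiss::gpu::GpuResourcesProvider` is expected, as the `reachability_faiss.cuh` hunk above shows. A minimal standalone sketch of the same pattern (the function, index type, and parameters are illustrative only, not part of this patch):

  #include <faiss/gpu/GpuIndexFlat.h>

  #include "faiss_mr.hpp"

  // Build a FAISS GPU index whose allocations flow through RMM.
  void build_index_example(int device, cudaStream_t stream, int dim)
  {
    ML::HDBSCAN::detail::Reachability::RmmGpuResources res;
    res.noTempMemory();                    // no reserved scratch; allocate at point of use
    res.setDefaultStream(device, stream);  // order all FAISS work on the caller's stream

    faiss::gpu::GpuIndexFlatConfig config;
    config.device = device;
    faiss::gpu::GpuIndexFlatL2 index(&res, dim, config);  // device memory now comes from RMM
  }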