From d0601afd260b2103a56d86737f3f0c1ba3cc0983 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 19 Apr 2022 19:21:03 -0400 Subject: [PATCH] Making cuco, thrust, and mdspan optional dependencies. (#585) In addition to the cuco dependency, the following changes are included: 1. ability to turn off thrust and mdspan dependencies (rmm is still required) 2. compiling libraries now defaults to the same setting of `BUILD_TESTS` (tests are still enabled) 3. cuco dependency is disabled by default (unless distance component is enabled) 4. the headers which are safe to expose in public APIs are moved over to `core/` directory. Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) - Robert Maynard (https://github.com/robertmaynard) - Mark Sadang (https://github.com/msadang) URL: https://github.com/rapidsai/raft/pull/585 --- BUILD.md | 45 +- README.md | 2 +- build.sh | 15 +- conda/recipes/libraft_distance/meta.yaml | 1 - conda/recipes/libraft_headers/build.sh | 2 +- conda/recipes/libraft_headers/meta.yaml | 1 - conda/recipes/libraft_nn/meta.yaml | 1 - cpp/CMakeLists.txt | 55 +- cpp/cmake/thirdparty/get_cuco.cmake | 31 +- cpp/cmake/thirdparty/get_libcudacxx.cmake | 6 +- cpp/cmake/thirdparty/get_mdspan.cmake | 26 +- cpp/cmake/thirdparty/get_thrust.cmake | 12 +- cpp/include/raft/common/logger.hpp | 282 +------- cpp/include/raft/common/nvtx.hpp | 141 +--- cpp/include/raft/comms/comms.hpp | 624 +---------------- cpp/include/raft/core/comms.hpp | 633 +++++++++++++++++ cpp/include/raft/core/cublas_macros.hpp | 121 ++++ cpp/include/raft/core/cudart_utils.hpp | 428 ++++++++++++ cpp/include/raft/core/cusolver_macros.hpp | 119 ++++ cpp/include/raft/core/cusparse_macros.hpp | 123 ++++ cpp/include/raft/core/error.hpp | 176 +++++ cpp/include/raft/core/handle.hpp | 339 +++++++++ cpp/include/raft/core/interruptible.hpp | 271 ++++++++ cpp/include/raft/core/logger.hpp | 298 ++++++++ 
cpp/include/raft/core/mdarray.hpp | 650 ++++++++++++++++++ cpp/include/raft/core/nvtx.hpp | 155 +++++ cpp/include/raft/core/span.hpp | 282 ++++++++ cpp/include/raft/cudart_utils.h | 406 +---------- cpp/include/raft/error.hpp | 162 +---- cpp/include/raft/handle.hpp | 324 +-------- cpp/include/raft/interruptible.hpp | 257 +------ cpp/include/raft/linalg/cublas_macros.h | 107 +-- cpp/include/raft/linalg/cusolver_macros.h | 103 +-- .../raft/linalg/detail/cublas_wrappers.hpp | 97 +-- .../raft/linalg/detail/cusolver_wrappers.hpp | 93 +-- cpp/include/raft/mdarray.hpp | 635 +---------------- cpp/include/raft/span.hpp | 267 +------ .../raft/sparse/detail/cusparse_macros.h | 108 +-- .../spatial/knn/detail/selection_faiss.cuh | 2 +- cpp/test/CMakeLists.txt | 1 - cpp/test/spatial/fused_l2_knn.cu | 4 + docs/source/cuda_cpp.rst | 2 +- docs/source/index.rst | 2 +- 43 files changed, 3767 insertions(+), 3642 deletions(-) create mode 100644 cpp/include/raft/core/comms.hpp create mode 100644 cpp/include/raft/core/cublas_macros.hpp create mode 100644 cpp/include/raft/core/cudart_utils.hpp create mode 100644 cpp/include/raft/core/cusolver_macros.hpp create mode 100644 cpp/include/raft/core/cusparse_macros.hpp create mode 100644 cpp/include/raft/core/error.hpp create mode 100644 cpp/include/raft/core/handle.hpp create mode 100644 cpp/include/raft/core/interruptible.hpp create mode 100644 cpp/include/raft/core/logger.hpp create mode 100644 cpp/include/raft/core/mdarray.hpp create mode 100644 cpp/include/raft/core/nvtx.hpp create mode 100644 cpp/include/raft/core/span.hpp diff --git a/BUILD.md b/BUILD.md index ef2d1a2bda..c4d8b1b356 100644 --- a/BUILD.md +++ b/BUILD.md @@ -26,12 +26,12 @@ In addition to the libraries included with cudatoolkit 11.0+, there are some other dependencies below for building RAFT from source. Many of the dependencies are optional and depend only on the primitives being used. 
All of these can be installed with cmake or [rapids-cpm](https://github.com/rapidsai/rapids-cmake#cpm) and many of them can be installed with [conda](https://anaconda.org). #### Required -- [Thrust](https://github.com/NVIDIA/thrust) v1.15 / [CUB](https://github.com/NVIDIA/cub) - [RMM](https://github.com/rapidsai/rmm) corresponding to RAFT version. -- [mdspan](https://github.com/rapidsai/mdspan) #### Optional -- [cuCollections](https://github.com/NVIDIA/cuCollections) - Used in `raft::sparse::distance` API +- [mdspan](https://github.com/rapidsai/mdspan) - On by default but can be disabled. +- [Thrust](https://github.com/NVIDIA/thrust) v1.15 / [CUB](https://github.com/NVIDIA/cub) - On by default but can be disabled. +- [cuCollections](https://github.com/NVIDIA/cuCollections) - Used in `raft::sparse::distance` API. - [Libcu++](https://github.com/NVIDIA/libcudacxx) v1.7.0 - [FAISS](https://github.com/facebookresearch/faiss) v1.7.0 - Used in `raft::spatial::knn` API and needed to build tests. - [NCCL](https://github.com/NVIDIA/nccl) - Used in `raft::comms` API and needed to build `Pyraft` @@ -53,6 +53,11 @@ The following example will download the needed dependencies and install the RAFT ./build.sh libraft --install ``` +The `--minimal-deps` flag can be used to install the headers with minimal dependencies: +```bash +./build.sh libraft --install --minimal-deps +``` + ### C++ Shared Libraries (optional) For larger projects which make heavy use of the pairwise distances or nearest neighbors APIs, shared libraries can be built to speed up compile times. These shared libraries can also significantly improve re-compile times both while developing RAFT and developing against the APIs. 
Build all of the available shared libraries by passing `--compile-libs` flag to `build.sh`: @@ -69,7 +74,14 @@ Add the `--install` flag to the above example to also install the shared librari ### Tests -Compile the tests using the `tests` target in `build.sh`. By default, the shared libraries are assumed to be already built and on the library path. Add `--compile-libs` to also compile them. +Compile the tests using the `tests` target in `build.sh`. + +```bash +./build.sh libraft tests +``` + +Test compile times can be improved significantly by using the optional shared libraries. If installed, they will be used automatically when building the tests but `--compile-libs` can be used to add additional compilation units and compile them with the tests. + ```bash ./build.sh libraft tests --compile-libs ``` @@ -110,11 +122,13 @@ RAFT's cmake has the following configurable flags available:. | --- | --- | --- | --- | | BUILD_TESTS | ON, OFF | ON | Compile Googletests | | BUILD_BENCH | ON, OFF | ON | Compile benchmarks | +| raft_FIND_COMPONENTS | nn distance | | Configures the optional components as a space-separated list | | RAFT_COMPILE_LIBRARIES | ON, OFF | OFF | Compiles all `libraft` shared libraries (these are required for Googletests) | -| RAFT_COMPILE_NN_LIBRARY | ON, OFF | ON | Compiles the `libraft-nn` shared library | -| RAFT_COMPILE_DIST_LIBRARY | ON, OFF | ON | Compiles the `libraft-distance` shared library | +| RAFT_COMPILE_NN_LIBRARY | ON, OFF | OFF | Compiles the `libraft-nn` shared library | +| RAFT_COMPILE_DIST_LIBRARY | ON, OFF | OFF | Compiles the `libraft-distance` shared library | | RAFT_ENABLE_NN_DEPENDENCIES | ON, OFF | OFF | Searches for dependencies of nearest neighbors API, such as FAISS, and compiles them if not found. 
Needed for `raft::spatial::knn` | -| RAFT_ENABLE_cuco_DEPENDENCY | ON, OFF | ON | Enables the cuCollections dependency used by `raft::sparse::distance` | +| RAFT_ENABLE_thrust_DEPENDENCY | ON, OFF | ON | Enables the Thrust dependency. This can be disabled when using many simple utilities or to override with a different Thrust version. | +| RAFT_ENABLE_mdspan_DEPENDENCY | ON, OFF | ON | Enables the std::mdspan dependency. This can be disabled when using many simple utilities. | | RAFT_ENABLE_nccl_DEPENDENCY | ON, OFF | OFF | Enables NCCL dependency used by `raft::comms` and needed to build `pyraft` | | RAFT_ENABLE_ucx_DEPENDENCY | ON, OFF | OFF | Enables UCX dependency used by `raft::comms` and needed to build `pyraft` | | RAFT_USE_FAISS_STATIC | ON, OFF | OFF | Statically link FAISS into `libraft-nn` | @@ -212,7 +226,8 @@ set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG USE_FAISS_STATIC COMPILE_LIBRARIES ENABLE_NN_DEPENDENCIES CLONE_ON_PIN - USE_NN_LIBRARY USE_DISTANCE_LIBRARY) + USE_NN_LIBRARY USE_DISTANCE_LIBRARY + ENABLE_thrust_DEPENDENCY ENABLE_mdspan_DEPENDENCY) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) @@ -256,6 +271,8 @@ function(find_and_configure_raft) "RAFT_ENABLE_NN_DEPENDENCIES ${PKG_ENABLE_NN_DEPENDENCIES}" "RAFT_USE_FAISS_STATIC ${PKG_USE_FAISS_STATIC}" "RAFT_COMPILE_LIBRARIES ${PKG_COMPILE_LIBRARIES}" + "RAFT_ENABLE_thrust_DEPENDENCY ${PKG_ENABLE_thrust_DEPENDENCY}" + "RAFT_ENABLE_mdspan_DEPENDENCY ${PKG_ENABLE_mdspan_DEPENDENCY}" ) endfunction() @@ -272,11 +289,13 @@ find_and_configure_raft(VERSION ${RAFT_VERSION}.00 # even if it's already installed. 
CLONE_ON_PIN ON - COMPILE_LIBRARIES NO - USE_NN_LIBRARY NO - USE_DISTANCE_LIBRARY NO - ENABLE_NN_DEPENDENCIES NO # This builds FAISS if not installed - USE_FAISS_STATIC NO + COMPILE_LIBRARIES NO + USE_NN_LIBRARY NO + USE_DISTANCE_LIBRARY NO + ENABLE_NN_DEPENDENCIES NO # This builds FAISS if not installed + USE_FAISS_STATIC NO + ENABLE_thrust_DEPENDENCY YES + ENABLE_mdspan_DEPENDENCY YES ) ``` diff --git a/README.md b/README.md index f73d474efc..c359a79e39 100755 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ #
 RAFT: Reusable Accelerated Functions and Tools
-RAFT contains fundamental widely-used algorithms and primitives for data science, graph and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics. +RAFT contains fundamental widely-used algorithms and primitives for data science and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics. By taking a primitives-based approach to algorithm development, RAFT - accelerates algorithm construction time diff --git a/build.sh b/build.sh index 0c3fbaccb6..568de2956d 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pyraft pylibraft docs tests bench clean -v -g --install --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss" +VALIDARGS="clean libraft pyraft pylibraft docs tests bench clean -v -g --install --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss --minimal-deps" HELP="$0 [ ...] [ ...] where is: clean - remove all existing build artifacts and configuration (start over) @@ -36,6 +36,8 @@ HELP="$0 [ ...] [ ...] --compile-libs - compile shared libraries for all components --compile-nn - compile shared library for nn component --compile-dist - compile shared library for distance component + --minimal-deps - disables dependencies like thrust so they can be overridden. 
+ can be useful for a pure header-only install --allgpuarch - build for all supported GPU architectures --buildfaiss - build faiss statically into raft --install - install cmake targets @@ -62,6 +64,9 @@ COMPILE_LIBRARIES=OFF COMPILE_NN_LIBRARY=OFF COMPILE_DIST_LIBRARY=OFF ENABLE_NN_DEPENDENCIES=OFF + +ENABLE_thrust_DEPENDENCY=ON + ENABLE_ucx_DEPENDENCY=OFF ENABLE_nccl_DEPENDENCY=OFF @@ -105,6 +110,11 @@ fi if hasArg --install; then INSTALL_TARGET="install" fi + +if hasArg --minimal-deps; then + ENABLE_thrust_DEPENDENCY=OFF +fi + if hasArg -v; then VERBOSE_FLAG="-v" CMAKE_LOG_LEVEL="VERBOSE" @@ -218,7 +228,8 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has -DRAFT_COMPILE_DIST_LIBRARY=${COMPILE_DIST_LIBRARY} \ -DRAFT_USE_FAISS_STATIC=${BUILD_STATIC_FAISS} \ -DRAFT_ENABLE_nccl_DEPENDENCY=${ENABLE_nccl_DEPENDENCY} \ - -DRAFT_ENABLE_ucx_DEPENDENCY=${ENABLE_ucx_DEPENDENCY} + -DRAFT_ENABLE_ucx_DEPENDENCY=${ENABLE_ucx_DEPENDENCY} \ + -DRAFT_ENABLE_thrust_DEPENDENCY=${ENABLE_thrust_DEPENDENCY} if [[ ${CMAKE_TARGET} != "" ]]; then echo "-- Compiling targets: ${CMAKE_TARGET}, verbose=${VERBOSE_FLAG}" diff --git a/conda/recipes/libraft_distance/meta.yaml b/conda/recipes/libraft_distance/meta.yaml index ad5a278466..9b78bd15f3 100644 --- a/conda/recipes/libraft_distance/meta.yaml +++ b/conda/recipes/libraft_distance/meta.yaml @@ -44,7 +44,6 @@ requirements: - ucx-py {{ ucx_py_version }} - ucx-proc=*=gpu - gtest=1.10.0 - - gmock - librmm {{ minor_version }} run: - libraft-headers {{ version }} diff --git a/conda/recipes/libraft_headers/build.sh b/conda/recipes/libraft_headers/build.sh index f239e545ef..d351b27577 100644 --- a/conda/recipes/libraft_headers/build.sh +++ b/conda/recipes/libraft_headers/build.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash # Copyright (c) 2022, NVIDIA CORPORATION. 
-./build.sh libraft --install -v --allgpuarch +./build.sh libraft --install -v --allgpuarch \ No newline at end of file diff --git a/conda/recipes/libraft_headers/meta.yaml b/conda/recipes/libraft_headers/meta.yaml index ed8dc4373e..fd95da66ee 100644 --- a/conda/recipes/libraft_headers/meta.yaml +++ b/conda/recipes/libraft_headers/meta.yaml @@ -43,7 +43,6 @@ requirements: - ucx-py {{ ucx_py_version }} - ucx-proc=*=gpu - gtest=1.10.0 - - gmock - librmm {{ minor_version}} - libcusolver>=11.2.1 run: diff --git a/conda/recipes/libraft_nn/meta.yaml b/conda/recipes/libraft_nn/meta.yaml index 8cedb15d09..fa3392ddc8 100644 --- a/conda/recipes/libraft_nn/meta.yaml +++ b/conda/recipes/libraft_nn/meta.yaml @@ -44,7 +44,6 @@ requirements: - faiss-proc=*=cuda - libfaiss 1.7.0 *_cuda - gtest=1.10.0 - - gmock - librmm {{ minor_version }} run: - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 35b066abc9..ab52b766e2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. #============================================================================= +set(RAPIDS_VERSION "22.06") +set(RAFT_VERSION "${RAPIDS_VERSION}.00") cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.06/RAPIDS.cmake @@ -26,7 +28,7 @@ include(rapids-find) rapids_cuda_init_architectures(RAFT) -project(RAFT VERSION 22.06.00 LANGUAGES CXX CUDA) +project(RAFT VERSION ${RAFT_VERSION} LANGUAGES CXX CUDA) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to # have different values for the `Threads::Threads` target. 
Setting this flag ensures @@ -55,16 +57,22 @@ option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF) option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON) -option(DISABLE_DEPRECATION_WARNINGS "Disable depreaction warnings " ON) +option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON) option(DISABLE_OPENMP "Disable OpenMP" OFF) option(NVTX "Enable nvtx markers" OFF) -option(RAFT_COMPILE_LIBRARIES "Enable building raft shared library instantiations" ON) +option(RAFT_COMPILE_LIBRARIES "Enable building raft shared library instantiations" ${BUILD_TESTS}) option(RAFT_COMPILE_NN_LIBRARY "Enable building raft nearest neighbors shared library instantiations" OFF) option(RAFT_COMPILE_DIST_LIBRARY "Enable building raft distant shared library instantiations" OFF) option(RAFT_ENABLE_NN_DEPENDENCIES "Search for raft::nn dependencies like faiss" ${RAFT_COMPILE_LIBRARIES}) -option(RAFT_ENABLE_cuco_DEPENDENCY "Enable cuCollections dependency" ON) +option(RAFT_ENABLE_mdspan_DEPENDENCY "Enable mdspan dependency" ON) +option(RAFT_ENABLE_thrust_DEPENDENCY "Enable Thrust dependency" ON) + +if(BUILD_TESTS AND NOT RAFT_ENABLE_thrust_DEPENDENCY) + message(VERBOSE "RAFT: BUILD_TESTS is enabled, overriding RAFT_ENABLE_thrust_DEPENDENCY") + set(RAFT_ENABLE_thrust_DEPENDENCY ON) +endif() # Currently, UCX and NCCL are only needed to build Pyraft and so a simple find_package() is sufficient option(RAFT_ENABLE_nccl_DEPENDENCY "Enable NCCL dependency" OFF) @@ -75,6 +83,7 @@ option(RAFT_EXCLUDE_FAISS_FROM_ALL "Exclude FAISS targets from RAFT's 'all' targ include(CMakeDependentOption) cmake_dependent_option(RAFT_USE_FAISS_STATIC "Build and statically link the FAISS library for nearest neighbors search on GPU" ON RAFT_COMPILE_LIBRARIES OFF) +message(VERBOSE "RAFT: 
Building optional components: ${raft_FIND_COMPONENTS}") message(VERBOSE "RAFT: Build RAFT unit-tests: ${BUILD_TESTS}") message(VERBOSE "RAFT: Building raft C++ benchmarks: ${BUILD_BENCH}") message(VERBOSE "RAFT: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}") @@ -123,6 +132,10 @@ include(cmake/modules/ConfigureCUDA.cmake) ############################################################################## # - Requirements ------------------------------------------------------------- +if(distance IN_LIST raft_FIND_COMPONENTS OR RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) + set(RAFT_ENABLE_cuco_DEPENDENCY ON) +endif() + # add third party dependencies using CPM rapids_cpm_init() @@ -151,8 +164,10 @@ target_include_directories(raft INTERFACE "$" "$") +# Keep RAFT as lightweight as possible. +# Only CUDA libs, rmm, and mdspan should +# be used in global target. target_link_libraries(raft INTERFACE - raft::Thrust $<$:CUDA::nvToolsExt> CUDA::cublas CUDA::curand @@ -160,8 +175,9 @@ target_link_libraries(raft INTERFACE CUDA::cudart CUDA::cusparse rmm::rmm - $<$:cuco::cuco> - std::mdspan) + $<$:raft::Thrust> + $<$:std::mdspan> +) target_compile_definitions(raft INTERFACE $<$:NVTX_ENABLED>) target_compile_features(raft INTERFACE cxx_std_17 $) @@ -248,6 +264,7 @@ endif() target_link_libraries(raft_distance INTERFACE raft::raft + $<$:cuco::cuco> $ $ ) @@ -301,6 +318,7 @@ endif() target_link_libraries(raft_nn INTERFACE raft::raft + $<$:faiss::faiss> $ $) @@ -341,6 +359,9 @@ install(DIRECTORY include/raft install(FILES include/raft.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/include/raft/version_config.hpp + DESTINATION include/raft) + ############################################################################## # - install export ----------------------------------------------------------- set(doc_string @@ -348,7 +369,7 @@ set(doc_string Provide targets for the RAFT: Reusable Accelerated Functions 
and Tools RAFT contains fundamental widely-used algorithms and primitives -for data science, graph, and ml. +for data science and machine learning. Optional Components: - nn @@ -361,13 +382,18 @@ Imported Targets: ]=]) -set(code_string -[=[ - -if(NOT TARGET raft::Thrust) - thrust_create_target(raft::Thrust FROM_OPTIONS) +set(code_string ) +if(RAFT_ENABLE_thrust_DEPENDENCY) + string(APPEND code_string + [=[ + if(NOT TARGET raft::Thrust) + thrust_create_target(raft::Thrust FROM_OPTIONS) + endif() + ]=]) endif() +string(APPEND code_string +[=[ if(distance IN_LIST raft_FIND_COMPONENTS) enable_language(CUDA) endif() @@ -381,8 +407,7 @@ if(nn IN_LIST raft_FIND_COMPONENTS) add_library(faiss ALIAS faiss::faiss) endif() endif() -]=] - ) +]=]) # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index a8951a3ee9..c35db4c962 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -16,21 +16,20 @@ function(find_and_configure_cuco VERSION) - if(RAFT_ENABLE_cuco_DEPENDENCY) - rapids_cpm_find(cuco ${VERSION} - GLOBAL_TARGETS cuco::cuco - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git - GIT_TAG fb58a38701f1c24ecfe07d8f1f208bbe80930da5 - OPTIONS "BUILD_TESTS OFF" - "BUILD_BENCHMARKS OFF" - "BUILD_EXAMPLES OFF" - ) - endif() - + rapids_cpm_find(cuco ${VERSION} + GLOBAL_TARGETS cuco::cuco + BUILD_EXPORT_SET raft-distance-exports + INSTALL_EXPORT_SET raft-distance-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git + GIT_TAG 6ec8b6dcdeceea07ab4456d32461a05c18864411 + OPTIONS "BUILD_TESTS OFF" + "BUILD_BENCHMARKS OFF" + "BUILD_EXAMPLES OFF" + ) endfunction() -# cuCollections doesn't have a version yet -find_and_configure_cuco(0.0.1) +if(RAFT_ENABLE_cuco_DEPENDENCY) + # 
cuCollections doesn't have a version yet + find_and_configure_cuco(0.0.1) +endif() diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index a018341b24..92d8e57de9 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -14,11 +14,13 @@ # This function finds libcudacxx and sets any additional necessary environment variables. function(find_and_configure_libcudacxx) + include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports) - endfunction() -find_and_configure_libcudacxx() +if(RAFT_ENABLE_cuco_DEPENDENCY) + find_and_configure_libcudacxx() +endif() \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_mdspan.cmake b/cpp/cmake/thirdparty/get_mdspan.cmake index 12ac7ab0fd..5af3c4f31e 100644 --- a/cpp/cmake/thirdparty/get_mdspan.cmake +++ b/cpp/cmake/thirdparty/get_mdspan.cmake @@ -13,17 +13,19 @@ # ============================================================================= function(find_and_configure_mdspan VERSION) - rapids_cpm_find( - mdspan ${VERSION} - GLOBAL_TARGETS std::mdspan - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/rapidsai/mdspan.git - GIT_TAG b3042485358d2ee168ae2b486c98c2c61ec5aec1 - OPTIONS "MDSPAN_ENABLE_CUDA ON" - "MDSPAN_CXX_STANDARD ON" - ) + rapids_cpm_find( + mdspan ${VERSION} + GLOBAL_TARGETS std::mdspan + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/rapidsai/mdspan.git + GIT_TAG b3042485358d2ee168ae2b486c98c2c61ec5aec1 + OPTIONS "MDSPAN_ENABLE_CUDA ON" + "MDSPAN_CXX_STANDARD ON" + ) endfunction() -find_and_configure_mdspan(0.2.0) +if(RAFT_ENABLE_mdspan_DEPENDENCY) + find_and_configure_mdspan(0.2.0) +endif() diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 
03dfecde6a..12360b9482 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -14,11 +14,13 @@ # Use CPM to find or clone thrust function(find_and_configure_thrust) - include(${rapids-cmake-dir}/cpm/thrust.cmake) + include(${rapids-cmake-dir}/cpm/thrust.cmake) - rapids_cpm_thrust( NAMESPACE raft - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports) + rapids_cpm_thrust( NAMESPACE raft + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports) endfunction() -find_and_configure_thrust() +if(RAFT_ENABLE_thrust_DEPENDENCY) + find_and_configure_thrust() +endif() diff --git a/cpp/include/raft/common/logger.hpp b/cpp/include/raft/common/logger.hpp index 9066e103d0..77483e577d 100644 --- a/cpp/include/raft/common/logger.hpp +++ b/cpp/include/raft/common/logger.hpp @@ -13,286 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#pragma once - -#include - -#include - -#include -#include -#include -#include -#include - -#include - -#define SPDLOG_HEADER_ONLY -#include -#include // NOLINT -#include // NOLINT /** - * @defgroup logging levels used in raft - * - * @note exactly match the corresponding ones (but reverse in terms of value) - * in spdlog for wrapping purposes - * - * @{ + * This file is deprecated and will be removed in release 22.08. + * Please use the include/core/logger.hpp instead. 
*/ -#define RAFT_LEVEL_TRACE 6 -#define RAFT_LEVEL_DEBUG 5 -#define RAFT_LEVEL_INFO 4 -#define RAFT_LEVEL_WARN 3 -#define RAFT_LEVEL_ERROR 2 -#define RAFT_LEVEL_CRITICAL 1 -#define RAFT_LEVEL_OFF 0 -/** @} */ - -#if !defined(RAFT_ACTIVE_LEVEL) -#define RAFT_ACTIVE_LEVEL RAFT_LEVEL_DEBUG -#endif -namespace raft { - -static const std::string RAFT_NAME = "raft"; -static const std::string default_log_pattern("[%L] [%H:%M:%S.%f] %v"); - -/** - * @defgroup CStringFormat Expand a C-style format string - * - * @brief Expands C-style formatted string into std::string - * - * @param[in] fmt format string - * @param[in] vl respective values for each of format modifiers in the string - * - * @return the expanded `std::string` - * - * @{ - */ -std::string format(const char* fmt, va_list& vl) -{ - char buf[4096]; - vsnprintf(buf, sizeof(buf), fmt, vl); - return std::string(buf); -} - -std::string format(const char* fmt, ...) -{ - va_list vl; - va_start(vl, fmt); - std::string str = format(fmt, vl); - va_end(vl); - return str; -} -/** @} */ - -int convert_level_to_spdlog(int level) -{ - level = std::max(RAFT_LEVEL_OFF, std::min(RAFT_LEVEL_TRACE, level)); - return RAFT_LEVEL_TRACE - level; -} - -/** - * @brief The main Logging class for raft library. - * - * This class acts as a thin wrapper over the underlying `spdlog` interface. The - * design is done in this way in order to avoid us having to also ship `spdlog` - * header files in our installation. - * - * @todo This currently only supports logging to stdout. 
Need to add support in - * future to add custom loggers as well [Issue #2046] - */ -class logger { - public: - // @todo setting the logger once per process with - logger(std::string const& name_ = "") - : sink{std::make_shared()}, - spdlogger{std::make_shared(name_, sink)}, - cur_pattern() - { - set_pattern(default_log_pattern); - set_level(RAFT_LEVEL_INFO); - } - /** - * @brief Singleton method to get the underlying logger object - * - * @return the singleton logger object - */ - static logger& get(std::string const& name = "") - { - if (log_map.find(name) == log_map.end()) { - log_map[name] = std::make_shared(name); - } - return *log_map[name]; - } - - /** - * @brief Set the logging level. - * - * Only messages with level equal or above this will be printed - * - * @param[in] level logging level - * - * @note The log level will actually be set only if the input is within the - * range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll - * be ignored. See documentation of decisiontree for how this gets used - */ - void set_level(int level) - { - level = convert_level_to_spdlog(level); - spdlogger->set_level(static_cast(level)); - } - - /** - * @brief Set the logging pattern - * - * @param[in] pattern the pattern to be set. 
Refer this link - * https://github.com/gabime/spdlog/wiki/3.-Custom-formatting - * to know the right syntax of this pattern - */ - void set_pattern(const std::string& pattern) - { - cur_pattern = pattern; - spdlogger->set_pattern(pattern); - } - - /** - * @brief Register a callback function to be run in place of usual log call - * - * @param[in] callback the function to be run on all logged messages - */ - void set_callback(void (*callback)(int lvl, const char* msg)) { sink->set_callback(callback); } - - /** - * @brief Register a flush function compatible with the registered callback - * - * @param[in] flush the function to use when flushing logs - */ - void set_flush(void (*flush)()) { sink->set_flush(flush); } - - /** - * @brief Tells whether messages will be logged for the given log level - * - * @param[in] level log level to be checked for - * @return true if messages will be logged for this level, else false - */ - bool should_log_for(int level) const - { - level = convert_level_to_spdlog(level); - auto level_e = static_cast(level); - return spdlogger->should_log(level_e); - } - - /** - * @brief Query for the current log level - * - * @return the current log level - */ - int get_level() const - { - auto level_e = spdlogger->level(); - return RAFT_LEVEL_TRACE - static_cast(level_e); - } - - /** - * @brief Get the current logging pattern - * @return the pattern - */ - std::string get_pattern() const { return cur_pattern; } - - /** - * @brief Main logging method - * - * @param[in] level logging level of this message - * @param[in] fmt C-like format string, followed by respective params - */ - void log(int level, const char* fmt, ...) 
- { - level = convert_level_to_spdlog(level); - auto level_e = static_cast(level); - // explicit check to make sure that we only expand messages when required - if (spdlogger->should_log(level_e)) { - va_list vl; - va_start(vl, fmt); - auto msg = format(fmt, vl); - va_end(vl); - spdlogger->log(level_e, msg); - } - } - - /** - * @brief Flush logs by calling flush on underlying logger - */ - void flush() { spdlogger->flush(); } - - ~logger() {} - - private: - logger(); - - static inline std::unordered_map> log_map; - std::shared_ptr sink; - std::shared_ptr spdlogger; - std::string cur_pattern; - int cur_level; -}; // class logger - -}; // namespace raft - -/** - * @defgroup loggerMacros Helper macros for dealing with logging - * @{ - */ -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE) -#define RAFT_LOG_TRACE(fmt, ...) \ - do { \ - std::stringstream ss; \ - ss << raft::detail::format("%s:%d ", __FILE__, __LINE__); \ - ss << raft::detail::format(fmt, ##__VA_ARGS__); \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \ - } while (0) -#else -#define RAFT_LOG_TRACE(fmt, ...) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) -#define RAFT_LOG_DEBUG(fmt, ...) \ - do { \ - std::stringstream ss; \ - ss << raft::format("%s:%d ", __FILE__, __LINE__); \ - ss << raft::format(fmt, ##__VA_ARGS__); \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_DEBUG, ss.str().c_str()); \ - } while (0) -#else -#define RAFT_LOG_DEBUG(fmt, ...) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_INFO) -#define RAFT_LOG_INFO(fmt, ...) \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_INFO, fmt, ##__VA_ARGS__) -#else -#define RAFT_LOG_INFO(fmt, ...) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_WARN) -#define RAFT_LOG_WARN(fmt, ...) \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_WARN, fmt, ##__VA_ARGS__) -#else -#define RAFT_LOG_WARN(fmt, ...) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_ERROR) -#define RAFT_LOG_ERROR(fmt, ...) 
\ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_ERROR, fmt, ##__VA_ARGS__) -#else -#define RAFT_LOG_ERROR(fmt, ...) void(0) -#endif +#pragma once -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_CRITICAL) -#define RAFT_LOG_CRITICAL(fmt, ...) \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_CRITICAL, fmt, ##__VA_ARGS__) -#else -#define RAFT_LOG_CRITICAL(fmt, ...) void(0) -#endif -/** @} */ +#include \ No newline at end of file diff --git a/cpp/include/raft/common/nvtx.hpp b/cpp/include/raft/common/nvtx.hpp index 918d5e10d8..385bc544b0 100644 --- a/cpp/include/raft/common/nvtx.hpp +++ b/cpp/include/raft/common/nvtx.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,142 +14,11 @@ * limitations under the License. */ -#pragma once - -#include "detail/nvtx.hpp" -#include - -/** - * \section Usage - * - * To add NVTX ranges to your code, use the `nvtx::range` RAII object. A - * range begins when the object is created, and ends when the object is - * destroyed. - * - * The example below creates nested NVTX ranges. The range `fun_scope` spans - * the whole function, while the range `epoch_scope` spans an iteration - * (and appears 5 times in the timeline). - * \code{.cpp} - * #include - * void some_function(int k){ - * // Begins a NVTX range with the messsage "some_function_{k}" - * // The range ends when some_function() returns - * common::nvtx::range fun_scope( r{"some_function_%d", k}; - * - * for(int i = 0; i < 5; i++){ - * common::nvtx::range epoch_scope{"epoch-%d", i}; - * // some logic inside the loop - * } - * } - * \endcode - * - * \section Domains - * - * All NVTX ranges are assigned to domains. A domain defines a named timeline in - * the Nsight Systems view. By default, we put all ranges into a domain `domain::app` - * named "application". 
This is controlled by the template parameter `Domain`. - * - * The example below defines a domain and uses it in a function. - * \code{.cpp} - * #include - * - * struct my_app_domain { - * static constexpr char const* name{"my application"}; - * } - * - * void some_function(int k){ - * // This NVTX range appears in the timeline named "my application" in Nsight Systems. - * common::nvtx::range fun_scope( r{"some_function_%d", k}; - * // some logic inside the loop - * } - * \endcode - */ -namespace raft::common::nvtx { - -namespace domain { - -/** @brief The default NVTX domain. */ -struct app { - static constexpr char const* name{"application"}; -}; - -/** @brief This NVTX domain is supposed to be used within raft. */ -struct raft { - static constexpr char const* name{"raft"}; -}; - -} // namespace domain - -/** - * @brief Push a named NVTX range. - * - * @tparam Domain optional struct that defines the NVTX domain message; - * You can create a new domain with a custom message as follows: - * \code{.cpp} - * struct custom_domain { static constexpr char const* name{"custom message"}; } - * \endcode - * NB: make sure to use the same domain for `push_range` and `pop_range`. - * @param format range name format (accepts printf-style arguments) - * @param args the arguments for the printf-style formatting - */ -template -inline void push_range(const char* format, Args... args) -{ - detail::push_range(format, args...); -} - /** - * @brief Pop the latest range. - * - * @tparam Domain optional struct that defines the NVTX domain message; - * You can create a new domain with a custom message as follows: - * \code{.cpp} - * struct custom_domain { static constexpr char const* name{"custom message"}; } - * \endcode - * NB: make sure to use the same domain for `push_range` and `pop_range`. + * This file is deprecated and will be removed in release 22.08. + * Please use the include/core/nvtx.hpp instead. 
*/ -template -inline void pop_range() -{ - detail::pop_range(); -} -/** - * @brief Push a named NVTX range that would be popped at the end of the object lifetime. - * - * Refer to \ref Usage for the usage examples. - * - * @tparam Domain optional struct that defines the NVTX domain message; - * You can create a new domain with a custom message as follows: - * \code{.cpp} - * struct custom_domain { static constexpr char const* name{"custom message"}; } - * \endcode - */ -template -class range { - public: - /** - * Push a named NVTX range. - * At the end of the object lifetime, pop the range back. - * - * @param format range name format (accepts printf-style arguments) - * @param args the arguments for the printf-style formatting - */ - template - explicit range(const char* format, Args... args) - { - push_range(format, args...); - } - - ~range() { pop_range(); } - - /* This object is not meant to be touched. */ - range(const range&) = delete; - range(range&&) = delete; - auto operator=(const range&) -> range& = delete; - auto operator=(range&&) -> range& = delete; - static auto operator new(std::size_t) -> void* = delete; - static auto operator new[](std::size_t) -> void* = delete; -}; +#pragma once -} // namespace raft::common::nvtx +#include \ No newline at end of file diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 9fb2b5a2c6..2ab0f053fc 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -13,631 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - /** * This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/comms.hpp instead. + * Please use core/comms.hpp instead. 
*/ -#ifndef __RAFT_RT_COMMS_H -#define __RAFT_RT_COMMS_H - #pragma once -#include -#include -#include - -namespace raft { -namespace comms { - -typedef unsigned int request_t; -enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; -enum class op_t { SUM, PROD, MIN, MAX }; - -/** - * The resulting status of distributed stream synchronization - */ -enum class status_t { - SUCCESS, // Synchronization successful - ERROR, // An error occured querying sync status - ABORT // A failure occurred in sync, queued operations aborted -}; - -template -constexpr datatype_t - -get_type(); - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::CHAR; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::UINT8; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::INT32; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::UINT32; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::INT64; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::UINT64; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::FLOAT32; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::FLOAT64; -} - -class comms_iface { - public: - virtual ~comms_iface() {} - - virtual int get_size() const = 0; - - virtual int get_rank() const = 0; - - virtual std::unique_ptr comm_split(int color, int key) const = 0; - - virtual void barrier() const = 0; - - virtual status_t sync_stream(cudaStream_t stream) const = 0; - - virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; - - virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; - - virtual void waitall(int count, request_t array_of_requests[]) const = 0; - - virtual void allreduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - 
op_t op, - cudaStream_t stream) const = 0; - - virtual void bcast( - void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; - - virtual void bcast(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void reduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - int root, - cudaStream_t stream) const = 0; - - virtual void allgather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void allgatherv(const void* sendbuf, - void* recvbuf, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void gather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void gatherv(const void* sendbuf, - void* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void reducescatter(const void* sendbuff, - void* recvbuff, - size_t recvcount, - datatype_t datatype, - op_t op, - cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; - - virtual void device_sendrecv(const void* sendbuf, - size_t sendsize, - int dest, - void* recvbuf, - size_t recvsize, - int source, - cudaStream_t stream) const = 0; - - virtual void device_multicast_sendrecv(const void* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - 
std::vector const& dests, - void* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const = 0; -}; - -class comms_t { - public: - comms_t(std::unique_ptr impl) : impl_(impl.release()) - { - ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); - } - - /** - * Virtual Destructor to enable polymorphism - */ - virtual ~comms_t() {} - - /** - * Returns the size of the communicator clique - */ - - int get_size() const { return impl_->get_size(); } - - /** - * Returns the local rank - */ - int get_rank() const { return impl_->get_rank(); } - - /** - * Splits the current communicator clique into sub-cliques matching - * the given color and key - * - * @param color ranks w/ the same color are placed in the same communicator - * @param key controls rank assignment - */ - std::unique_ptr comm_split(int color, int key) const - { - return impl_->comm_split(color, key); - } - - /** - * Performs a collective barrier synchronization - */ - void barrier() const { impl_->barrier(); } - - /** - * Some collective communications implementations (eg. NCCL) might use asynchronous - * collectives that are explicitly synchronized. It's important to always synchronize - * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, - * to prevent the potential for deadlocks. - * - * @param stream the cuda stream to sync collective operations on - */ - status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } - - /** - * Performs an asynchronous point-to-point send - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param tag a tag to use for the receiver to filter - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). 
- */ - template - void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const - { - impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); - } - - /** - * Performs an asynchronous point-to-point receive - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param tag a tag to use for message filtering - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). - */ - template - void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const - { - impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); - } - - /** - * Synchronize on an array of request_t objects returned from isend/irecv - * @param count number of requests to synchronize on - * @param array_of_requests an array of request_t objects returned from isend/irecv - */ - void waitall(int count, request_t array_of_requests[]) const - { - impl_->waitall(count, array_of_requests); - } - - /** - * Perform an allreduce collective - * @tparam value_t datatype of underlying buffers - * @param sendbuff data to reduce - * @param recvbuff buffer to hold the reduced result - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void allreduce( - const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const - { - impl_->allreduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param buff buffer to send - * @param count number of elements if buff - * @param root the rank initiating the broadcast - 
* @param stream CUDA stream to synchronize operation - */ - template - void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(buff), count, get_type(), root, stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to broadcast (only used in root) - * @param recvbuff buffer to receive broadcasted data - * @param count number of elements if buff - * @param root the rank initiating the broadcast - * @param stream CUDA stream to synchronize operation - */ - template - void bcast( - const value_t* sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - root, - stream); - } - - /** - * Reduce data from many ranks down to a single rank - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to reduce - * @param recvbuff buffer containing reduced data (only needs to be initialized on root) - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void reduce(const value_t* sendbuff, - value_t* recvbuff, - size_t count, - op_t op, - int root, - cudaStream_t stream) const - { - impl_->reduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - root, - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param stream CUDA stream to synchronize operation - */ - template - void allgather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - 
cudaStream_t stream) const - { - impl_->allgather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param recvcounts pointer to an array (of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place the incoming data from each rank - * @param stream CUDA stream to synchronize operation - */ - template - void allgatherv(const value_t* sendbuf, - value_t* recvbuf, - const size_t* recvcounts, - const size_t* displs, - cudaStream_t stream) const - { - impl_->allgatherv(static_cast(sendbuf), - static_cast(recvbuf), - recvcounts, - displs, - get_type(), - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - int root, - cudaStream_t stream) const - { - impl_->gather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - root, - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param sendcount number of elements in send buffer - * @param recvcounts pointer to an array 
(of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place the incoming data from each rank - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gatherv(const value_t* sendbuf, - value_t* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - int root, - cudaStream_t stream) const - { - impl_->gatherv(static_cast(sendbuf), - static_cast(recvbuf), - sendcount, - recvcounts, - displs, - get_type(), - root, - stream); - } - - /** - * Reduces data from all ranks then scatters the result across ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to send (size recvcount * num_ranks) - * @param recvbuff buffer containing received data - * @param recvcount number of items to receive - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void reducescatter(const value_t* sendbuff, - value_t* recvbuff, - size_t recvcount, - op_t op, - cudaStream_t stream) const - { - impl_->reducescatter(static_cast(sendbuff), - static_cast(recvbuff), - recvcount, - get_type(), - op, - stream); - } - - /** - * Performs a point-to-point send - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
- * - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const - { - impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); - } - - /** - * Performs a point-to-point receive - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. - * - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const - { - impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); - } - - /** - * Performs a point-to-point send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsize number of elements in sendbuf - * @param dest destination rank - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsize number of elements in recvbuf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_sendrecv(const value_t* sendbuf, - size_t sendsize, - int dest, - value_t* recvbuf, - size_t recvsize, - int source, - cudaStream_t stream) const - { - impl_->device_sendrecv(static_cast(sendbuf), - sendsize * sizeof(value_t), - dest, - static_cast(recvbuf), - recvsize * sizeof(value_t), - source, - stream); - } - - /** - * Performs a multicast send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsizes 
numbers of elements to send - * @param sendoffsets offsets in a number of elements from sendbuf - * @param dests destination ranks - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsizes numbers of elements to recv - * @param recvoffsets offsets in a number of elements from recvbuf - * @param sources source ranks - * @param stream CUDA stream to synchronize operation - */ - template - void device_multicast_sendrecv(const value_t* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - std::vector const& dests, - value_t* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const - { - auto sendbytesizes = sendsizes; - auto sendbyteoffsets = sendoffsets; - for (size_t i = 0; i < sendsizes.size(); ++i) { - sendbytesizes[i] *= sizeof(value_t); - sendbyteoffsets[i] *= sizeof(value_t); - } - auto recvbytesizes = recvsizes; - auto recvbyteoffsets = recvoffsets; - for (size_t i = 0; i < recvsizes.size(); ++i) { - recvbytesizes[i] *= sizeof(value_t); - recvbyteoffsets[i] *= sizeof(value_t); - } - impl_->device_multicast_sendrecv(static_cast(sendbuf), - sendbytesizes, - sendbyteoffsets, - dests, - static_cast(recvbuf), - recvbytesizes, - recvbyteoffsets, - sources, - stream); - } - - private: - std::unique_ptr impl_; -}; - -} // namespace comms -} // namespace raft - -#endif +#include diff --git a/cpp/include/raft/core/comms.hpp b/cpp/include/raft/core/comms.hpp new file mode 100644 index 0000000000..bf2f7af777 --- /dev/null +++ b/cpp/include/raft/core/comms.hpp @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace comms { + +typedef unsigned int request_t; +enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; +enum class op_t { SUM, PROD, MIN, MAX }; + +/** + * The resulting status of distributed stream synchronization + */ +enum class status_t { + SUCCESS, // Synchronization successful + ERROR, // An error occured querying sync status + ABORT // A failure occurred in sync, queued operations aborted +}; + +template +constexpr datatype_t + +get_type(); + +template <> +constexpr datatype_t + +get_type() +{ + return datatype_t::CHAR; +} + +template <> +constexpr datatype_t + +get_type() +{ + return datatype_t::UINT8; +} + +template <> +constexpr datatype_t + +get_type() +{ + return datatype_t::INT32; +} + +template <> +constexpr datatype_t + +get_type() +{ + return datatype_t::UINT32; +} + +template <> +constexpr datatype_t + +get_type() +{ + return datatype_t::INT64; +} + +template <> +constexpr datatype_t + +get_type() +{ + return datatype_t::UINT64; +} + +template <> +constexpr datatype_t + +get_type() +{ + return datatype_t::FLOAT32; +} + +template <> +constexpr datatype_t + +get_type() +{ + return datatype_t::FLOAT64; +} + +class comms_iface { + public: + virtual ~comms_iface() {} + + virtual int get_size() const = 0; + + virtual int get_rank() const = 0; + + virtual std::unique_ptr comm_split(int color, int key) const = 0; + + virtual void barrier() const = 0; + + virtual status_t sync_stream(cudaStream_t stream) const = 0; + + virtual 
void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; + + virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; + + virtual void waitall(int count, request_t array_of_requests[]) const = 0; + + virtual void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; + + virtual void bcast( + void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; + + virtual void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const = 0; + + virtual void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void gatherv(const void* sendbuf, + void* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; + + // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; + + // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + virtual void 
device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; + + virtual void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const = 0; + + virtual void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + void* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const = 0; +}; + +class comms_t { + public: + comms_t(std::unique_ptr impl) : impl_(impl.release()) + { + ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); + } + + /** + * Virtual Destructor to enable polymorphism + */ + virtual ~comms_t() {} + + /** + * Returns the size of the communicator clique + */ + + int get_size() const { return impl_->get_size(); } + + /** + * Returns the local rank + */ + int get_rank() const { return impl_->get_rank(); } + + /** + * Splits the current communicator clique into sub-cliques matching + * the given color and key + * + * @param color ranks w/ the same color are placed in the same communicator + * @param key controls rank assignment + */ + std::unique_ptr comm_split(int color, int key) const + { + return impl_->comm_split(color, key); + } + + /** + * Performs a collective barrier synchronization + */ + void barrier() const { impl_->barrier(); } + + /** + * Some collective communications implementations (eg. NCCL) might use asynchronous + * collectives that are explicitly synchronized. It's important to always synchronize + * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, + * to prevent the potential for deadlocks. 
+ * + * @param stream the cuda stream to sync collective operations on + */ + status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } + + /** + * Performs an asynchronous point-to-point send + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param tag a tag to use for the receiver to filter + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). + */ + template + void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const + { + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); + } + + /** + * Performs an asynchronous point-to-point receive + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param tag a tag to use for message filtering + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). 
+ */ + template + void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const + { + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); + } + + /** + * Synchronize on an array of request_t objects returned from isend/irecv + * @param count number of requests to synchronize on + * @param array_of_requests an array of request_t objects returned from isend/irecv + */ + void waitall(int count, request_t array_of_requests[]) const + { + impl_->waitall(count, array_of_requests); + } + + /** + * Perform an allreduce collective + * @tparam value_t datatype of underlying buffers + * @param sendbuff data to reduce + * @param recvbuff buffer to hold the reduced result + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void allreduce( + const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const + { + impl_->allreduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param buff buffer to send + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(buff), count, get_type(), root, stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to broadcast (only used in root) + * @param recvbuff buffer to receive broadcasted data + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast( + const value_t* 
sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + root, + stream); + } + + /** + * Reduce data from many ranks down to a single rank + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to reduce + * @param recvbuff buffer containing reduced data (only needs to be initialized on root) + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void reduce(const value_t* sendbuff, + value_t* recvbuff, + size_t count, + op_t op, + int root, + cudaStream_t stream) const + { + impl_->reduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + root, + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param stream CUDA stream to synchronize operation + */ + template + void allgather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + cudaStream_t stream) const + { + impl_->allgather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place 
the incoming data from each rank + * @param stream CUDA stream to synchronize operation + */ + template + void allgatherv(const value_t* sendbuf, + value_t* recvbuf, + const size_t* recvcounts, + const size_t* displs, + cudaStream_t stream) const + { + impl_->allgatherv(static_cast(sendbuf), + static_cast(recvbuf), + recvcounts, + displs, + get_type(), + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + int root, + cudaStream_t stream) const + { + impl_->gather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + root, + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * @param sendcount number of elements in send buffer + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place the incoming data from each rank + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gatherv(const value_t* sendbuf, + value_t* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + int root, + cudaStream_t stream) const + { + impl_->gatherv(static_cast(sendbuf), + static_cast(recvbuf), + sendcount, + recvcounts, + displs, 
+ get_type(), + root, + stream); + } + + /** + * Reduces data from all ranks then scatters the result across ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to send (size recvcount * num_ranks) + * @param recvbuff buffer containing received data + * @param recvcount number of items to receive + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void reducescatter(const value_t* sendbuff, + value_t* recvbuff, + size_t recvcount, + op_t op, + cudaStream_t stream) const + { + impl_->reducescatter(static_cast(sendbuff), + static_cast(recvbuff), + recvcount, + get_type(), + op, + stream); + } + + /** + * Performs a point-to-point send + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. + * + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const + { + impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); + } + + /** + * Performs a point-to-point receive + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
+ * + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const + { + impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); + } + + /** + * Performs a point-to-point send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsize number of elements in sendbuf + * @param dest destination rank + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsize number of elements in recvbuf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_sendrecv(const value_t* sendbuf, + size_t sendsize, + int dest, + value_t* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { + impl_->device_sendrecv(static_cast(sendbuf), + sendsize * sizeof(value_t), + dest, + static_cast(recvbuf), + recvsize * sizeof(value_t), + source, + stream); + } + + /** + * Performs a multicast send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsizes numbers of elements to send + * @param sendoffsets offsets in a number of elements from sendbuf + * @param dests destination ranks + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsizes numbers of elements to recv + * @param recvoffsets offsets in a number of elements from recvbuf + * @param sources source ranks + * @param stream CUDA stream to synchronize operation + */ + template + void device_multicast_sendrecv(const value_t* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + 
std::vector const& dests, + value_t* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const + { + auto sendbytesizes = sendsizes; + auto sendbyteoffsets = sendoffsets; + for (size_t i = 0; i < sendsizes.size(); ++i) { + sendbytesizes[i] *= sizeof(value_t); + sendbyteoffsets[i] *= sizeof(value_t); + } + auto recvbytesizes = recvsizes; + auto recvbyteoffsets = recvoffsets; + for (size_t i = 0; i < recvsizes.size(); ++i) { + recvbytesizes[i] *= sizeof(value_t); + recvbyteoffsets[i] *= sizeof(value_t); + } + impl_->device_multicast_sendrecv(static_cast(sendbuf), + sendbytesizes, + sendbyteoffsets, + dests, + static_cast(recvbuf), + recvbytesizes, + recvbyteoffsets, + sources, + stream); + } + + private: + std::unique_ptr impl_; +}; + +} // namespace comms +} // namespace raft diff --git a/cpp/include/raft/core/cublas_macros.hpp b/cpp/include/raft/core/cublas_macros.hpp new file mode 100644 index 0000000000..f5de57677d --- /dev/null +++ b/cpp/include/raft/core/cublas_macros.hpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RAFT_RT_CUBLAS_MACROS_H +#define __RAFT_RT_CUBLAS_MACROS_H + +#pragma once + +#include +#include + +///@todo: enable this once we have logger enabled +//#include + +#include + +#define _CUBLAS_ERR_TO_STR(err) \ + case err: return #err + +namespace raft { + +/** + * @brief Exception thrown when a cuBLAS error is encountered. + */ +struct cublas_error : public raft::exception { + explicit cublas_error(char const* const message) : raft::exception(message) {} + explicit cublas_error(std::string const& message) : raft::exception(message) {} +}; + +namespace linalg { +namespace detail { + +inline const char* cublas_error_to_string(cublasStatus_t err) +{ + switch (err) { + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); + default: return "CUBLAS_STATUS_UNKNOWN"; + }; +} + +} // namespace detail +} // namespace linalg +} // namespace raft + +#undef _CUBLAS_ERR_TO_STR + +/** + * @brief Error checking macro for cuBLAS runtime API functions. 
+ * + * Invokes a cuBLAS runtime API function call, if the call does not return + * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred + */ +#define RAFT_CUBLAS_TRY(call) \ + do { \ + cublasStatus_t const status = (call); \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuBLAS error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::linalg::detail::cublas_error_to_string(status)); \ + throw raft::cublas_error(msg); \ + } \ + } while (0) + +// FIXME: Remove after consumers rename +#ifndef CUBLAS_TRY +#define CUBLAS_TRY(call) RAFT_CUBLAS_TRY(call) +#endif + +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. +// */ +#define RAFT_CUBLAS_TRY_NO_THROW(call) \ + do { \ + cublasStatus_t const status = call; \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + printf("CUBLAS call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + raft::linalg::detail::cublas_error_to_string(status)); \ + } \ + } while (0) + +/** FIXME: remove after cuml rename */ +#ifndef CUBLAS_CHECK +#define CUBLAS_CHECK(call) CUBLAS_TRY(call) +#endif + +/** FIXME: remove after cuml rename */ +#ifndef CUBLAS_CHECK_NO_THROW +#define CUBLAS_CHECK_NO_THROW(call) RAFT_CUBLAS_TRY_NO_THROW(call) +#endif + +#endif \ No newline at end of file diff --git a/cpp/include/raft/core/cudart_utils.hpp b/cpp/include/raft/core/cudart_utils.hpp new file mode 100644 index 0000000000..5adc0227a8 --- /dev/null +++ b/cpp/include/raft/core/cudart_utils.hpp @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This file is deprecated and will be removed in release 22.06. + * Please use raft_runtime/cudart_utils.hpp instead. + */ + +#ifndef __RAFT_RT_CUDART_UTILS_H +#define __RAFT_RT_CUDART_UTILS_H + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +///@todo: enable once logging has been enabled in raft +//#include "logger.hpp" + +namespace raft { + +/** + * @brief Exception thrown when a CUDA error is encountered. + */ +struct cuda_error : public raft::exception { + explicit cuda_error(char const* const message) : raft::exception(message) {} + explicit cuda_error(std::string const& message) : raft::exception(message) {} +}; + +} // namespace raft + +/** + * @brief Error checking macro for CUDA runtime API functions. 
+ * + * Invokes a CUDA runtime API function call, if the call does not return + * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an + * exception detailing the CUDA error that occurred + * + */ +#define RAFT_CUDA_TRY(call) \ + do { \ + cudaError_t const status = call; \ + if (status != cudaSuccess) { \ + cudaGetLastError(); \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "CUDA error encountered at: ", \ + "call='%s', Reason=%s:%s", \ + #call, \ + cudaGetErrorName(status), \ + cudaGetErrorString(status)); \ + throw raft::cuda_error(msg); \ + } \ + } while (0) + +// FIXME: Remove after consumers rename +#ifndef CUDA_TRY +#define CUDA_TRY(call) RAFT_CUDA_TRY(call) +#endif + +/** + * @brief Debug macro to check for CUDA errors + * + * In a non-release build, this macro will synchronize the specified stream + * before error checking. In both release and non-release builds, this macro + * checks for any pending CUDA errors from previous calls. If an error is + * reported, an exception is thrown detailing the CUDA error that occurred. + * + * The intent of this macro is to provide a mechanism for synchronous and + * deterministic execution for debugging asynchronous CUDA execution. It should + * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an + * asynchronous kernel launch. + */ +#ifndef NDEBUG +#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); +#else +#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaPeekAtLastError()); +#endif + +// FIXME: Remove after consumers rename +#ifndef CHECK_CUDA +#define CHECK_CUDA(call) RAFT_CHECK_CUDA(call) +#endif + +/** FIXME: remove after cuml rename */ +#ifndef CUDA_CHECK +#define CUDA_CHECK(call) RAFT_CUDA_TRY(call) +#endif + +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. 
+// */ +#define RAFT_CUDA_TRY_NO_THROW(call) \ + do { \ + cudaError_t const status = call; \ + if (cudaSuccess != status) { \ + printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(status)); \ + } \ + } while (0) + +// FIXME: Remove after cuml rename +#ifndef CUDA_CHECK_NO_THROW +#define CUDA_CHECK_NO_THROW(call) RAFT_CUDA_TRY_NO_THROW(call) +#endif + +/** + * Alias to raft scope for now. + * TODO: Rename original implementations in 22.04 to fix + * https://github.com/rapidsai/raft/issues/128 + */ + +namespace raft { + +/** Helper method to get to know warp size in device code */ +__host__ __device__ constexpr inline int warp_size() { return 32; } + +__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } + +/** + * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping + * elements to threads. + */ +class grid_1d_thread_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param max_num_blocks_1d maximum number of blocks in 1d grid + * @param elements_per_thread Typically, a single kernel thread processes more than a single + * element; this affects the number of threads the grid must contain + */ + grid_1d_thread_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d, + size_t elements_per_thread = 1) + : block_size(num_threads_per_block), + num_blocks( + std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + 
max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); + } +}; + +/** + * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping + * elements to warps. + */ +class grid_1d_warp_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param max_num_blocks_1d maximum number of blocks in 1d grid + */ + grid_1d_warp_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / + (num_threads_per_block / warp_size()), + max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } +}; + +/** + * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping + * elements to blocks. 
+ */ +class grid_1d_block_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param max_num_blocks_1d maximum number of blocks in 1d grid + */ + grid_1d_block_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } +}; + +/** + * @brief Generic copy method for all kinds of transfers + * @tparam Type data type + * @param dst destination pointer + * @param src source pointer + * @param len lenth of the src/dst buffers in terms of number of elements + * @param stream cuda stream + */ +template +void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) +{ + CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); +} + +/** + * @defgroup Copy Copy methods + * These are here along with the generic 'copy' method in order to improve + * code readability using explicitly specified function names + * @{ + */ +/** performs a host to device copy */ +template +void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) +{ + copy(d_ptr, h_ptr, len, stream); +} + +/** performs a device to host copy */ +template +void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) +{ + copy(h_ptr, d_ptr, len, stream); +} + +template +void copy_async(Type* 
d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) +{ + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); +} +/** @} */ + +/** + * @defgroup Debug Utils for debugging host/device buffers + * @{ + */ +template +void print_host_vector(const char* variable_name, + const T* host_mem, + size_t componentsCount, + OutStream& out) +{ + out << variable_name << "=["; + for (size_t i = 0; i < componentsCount; ++i) { + if (i != 0) out << ","; + out << host_mem[i]; + } + out << "];\n"; +} + +template +void print_device_vector(const char* variable_name, + const T* devMem, + size_t componentsCount, + OutStream& out) +{ + T* host_mem = new T[componentsCount]; + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); + print_host_vector(variable_name, host_mem, componentsCount, out); + delete[] host_mem; +} +/** @} */ + +/** helper method to get max usable shared mem per block parameter */ +inline int getSharedMemPerBlock() +{ + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int smemPerBlk; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); + return smemPerBlk; +} + +/** helper method to get multi-processor count parameter */ +inline int getMultiProcessorCount() +{ + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int mpCount; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + return mpCount; +} + +/** helper method to convert an array on device to a string on host */ +template +std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) +{ + std::stringstream ss; + + T* arr_h = (T*)malloc(size * sizeof(T)); + update_host(arr_h, arr, size, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + + ss << name << " = [ "; + for (int i = 0; i < size; i++) { + ss << std::setw(width) << arr_h[i]; + + if (i < size - 1) ss << ", "; + } + ss << " ]" 
<< std::endl; + + free(arr_h); + + return ss.str(); +} + +/** this seems to be unused, but may be useful in the future */ +template +void ASSERT_DEVICE_MEM(T* ptr, std::string name) +{ + cudaPointerAttributes s_att; + cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); + + if (s_err != 0 || s_att.device == -1) + std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device + << ", err=" << s_err << std::endl; +} + +inline uint32_t curTimeMillis() +{ + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + return std::chrono::duration_cast(duration).count(); +} + +/** Helper function to calculate need memory for allocate to store dense matrix. + * @param rows number of rows in matrix + * @param columns number of columns in matrix + * @return need number of items to allocate via allocate() + * @sa allocate() + */ +inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } + +/** Helper function to check alignment of pointer. 
+ * @param ptr the pointer to check + * @param alignment to be checked for + * @return true if address in bytes is a multiple of alignment + */ +template +bool is_aligned(Type* ptr, size_t alignment) +{ + return reinterpret_cast(ptr) % alignment == 0; +} + +/** calculate greatest common divisor of two numbers + * @a integer + * @b integer + * @ return gcd of a and b + */ +template +IntType gcd(IntType a, IntType b) +{ + while (b != 0) { + IntType tmp = b; + b = a % b; + a = tmp; + } + return a; +} + +template +constexpr T lower_bound() +{ + if constexpr (std::numeric_limits::has_infinity && std::numeric_limits::is_signed) { + return -std::numeric_limits::infinity(); + } + return std::numeric_limits::lowest(); +} + +template +constexpr T upper_bound() +{ + if constexpr (std::numeric_limits::has_infinity) { return std::numeric_limits::infinity(); } + return std::numeric_limits::max(); +} + +} // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/core/cusolver_macros.hpp b/cpp/include/raft/core/cusolver_macros.hpp new file mode 100644 index 0000000000..b41927f5fb --- /dev/null +++ b/cpp/include/raft/core/cusolver_macros.hpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RAFT_RT_CUSOLVER_MACROS_H +#define __RAFT_RT_CUSOLVER_MACROS_H + +#pragma once + +#include +#include +///@todo: enable this once logging is enabled +//#include +#include +#include + +#define _CUSOLVER_ERR_TO_STR(err) \ + case err: return #err; + +namespace raft { + +/** + * @brief Exception thrown when a cuSOLVER error is encountered. + */ +struct cusolver_error : public raft::exception { + explicit cusolver_error(char const* const message) : raft::exception(message) {} + explicit cusolver_error(std::string const& message) : raft::exception(message) {} +}; + +namespace linalg { +namespace detail { + +inline const char* cusolver_error_to_string(cusolverStatus_t err) +{ + switch (err) { + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); + default: return "CUSOLVER_STATUS_UNKNOWN"; + }; +} + +} // namespace detail +} // namespace linalg +} // namespace raft + +#undef _CUSOLVER_ERR_TO_STR + +/** + * @brief Error checking macro for cuSOLVER runtime API functions. 
+ * + * Invokes a cuSOLVER runtime API function call, if the call does not return + * CUSolver_STATUS_SUCCESS, throws an exception detailing the cuSOLVER error that occurred + */ +#define RAFT_CUSOLVER_TRY(call) \ + do { \ + cusolverStatus_t const status = (call); \ + if (CUSOLVER_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuSOLVER error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::linalg::detail::cusolver_error_to_string(status)); \ + throw raft::cusolver_error(msg); \ + } \ + } while (0) + +// FIXME: remove after consumer rename +#ifndef CUSOLVER_TRY +#define CUSOLVER_TRY(call) RAFT_CUSOLVER_TRY(call) +#endif + +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. +// */ +#define RAFT_CUSOLVER_TRY_NO_THROW(call) \ + do { \ + cusolverStatus_t const status = call; \ + if (CUSOLVER_STATUS_SUCCESS != status) { \ + printf("CUSOLVER call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + raft::linalg::detail::cusolver_error_to_string(status)); \ + } \ + } while (0) + +// FIXME: remove after cuml rename +#ifndef CUSOLVER_CHECK +#define CUSOLVER_CHECK(call) CUSOLVER_TRY(call) +#endif + +#ifndef CUSOLVER_CHECK_NO_THROW +#define CUSOLVER_CHECK_NO_THROW(call) CUSOLVER_TRY_NO_THROW(call) +#endif + +#endif \ No newline at end of file diff --git a/cpp/include/raft/core/cusparse_macros.hpp b/cpp/include/raft/core/cusparse_macros.hpp new file mode 100644 index 0000000000..10c7e8836c --- /dev/null +++ b/cpp/include/raft/core/cusparse_macros.hpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +///@todo: enable this once logging is enabled +//#include + +#define _CUSPARSE_ERR_TO_STR(err) \ + case err: return #err; + +// Notes: +//(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic; +//(2.) to enforce a lower version, +// +//`#define CUDA_ENFORCE_LOWER +// #include ` +// +// (i.e., before including this header) +// +#define CUDA_VER_10_1_UP (CUDART_VERSION >= 10100) + +namespace raft { + +/** + * @brief Exception thrown when a cuSparse error is encountered. + */ +struct cusparse_error : public raft::exception { + explicit cusparse_error(char const* const message) : raft::exception(message) {} + explicit cusparse_error(std::string const& message) : raft::exception(message) {} +}; + +namespace sparse { +namespace detail { + +inline const char* cusparse_error_to_string(cusparseStatus_t err) +{ +#if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 + return cusparseGetErrorString(err); +#else // CUDART_VERSION + switch (err) { + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + default: return "CUSPARSE_STATUS_UNKNOWN"; + }; +#endif // CUDART_VERSION +} + +} // namespace detail +} // 
namespace sparse +} // namespace raft + +#undef _CUSPARSE_ERR_TO_STR + +/** + * @brief Error checking macro for cuSparse runtime API functions. + * + * Invokes a cuSparse runtime API function call, if the call does not return + * CUSPARSE_STATUS_SUCCESS, throws an exception detailing the cuSparse error that occurred + */ +#define RAFT_CUSPARSE_TRY(call) \ + do { \ + cusparseStatus_t const status = (call); \ + if (CUSPARSE_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuSparse error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::sparse::detail::cusparse_error_to_string(status)); \ + throw raft::cusparse_error(msg); \ + } \ + } while (0) + +// FIXME: Remove after consumer rename +#ifndef CUSPARSE_TRY +#define CUSPARSE_TRY(call) RAFT_CUSPARSE_TRY(call) +#endif + +// FIXME: Remove after consumer rename +#ifndef CUSPARSE_CHECK +#define CUSPARSE_CHECK(call) CUSPARSE_TRY(call) +#endif + +//@todo: use logger here once logging is enabled +/** check for cusparse runtime API errors but do not assert */ +#define RAFT_CUSPARSE_TRY_NO_THROW(call) \ + do { \ + cusparseStatus_t err = call; \ + if (err != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE call='%s' got errorcode=%d err=%s", \ + #call, \ + err, \ + raft::sparse::detail::cusparse_error_to_string(err)); \ + } \ + } while (0) + +// FIXME: Remove after consumer rename +#ifndef CUSPARSE_CHECK_NO_THROW +#define CUSPARSE_CHECK_NO_THROW(call) RAFT_CUSPARSE_TRY_NO_THROW(call) +#endif diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp new file mode 100644 index 0000000000..a65b9a8469 --- /dev/null +++ b/cpp/include/raft/core/error.hpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/*
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __RAFT_RT_ERROR
#define __RAFT_RT_ERROR

#pragma once

// NOTE(review): the original include targets were lost during extraction; this list is
// reconstructed from the symbols used below -- confirm against upstream.
#include <execinfo.h>

#include <cstdio>
#include <cstdlib>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>

namespace raft {

/** base exception class for the whole of raft */
class exception : public std::exception {
 public:
  /** default ctor */
  explicit exception() noexcept : std::exception(), msg_() {}

  /** copy ctor */
  exception(exception const& src) noexcept : std::exception(), msg_(src.what())
  {
    collect_call_stack();
  }

  /** ctor from an input message */
  // FIX: parameter was `std::string const msg` -- std::move of a const copies instead
  // of moving; drop the const so the move actually takes effect.
  explicit exception(std::string msg) noexcept : std::exception(), msg_(std::move(msg))
  {
    collect_call_stack();
  }

  /** get the message associated with this exception */
  char const* what() const noexcept override { return msg_.c_str(); }

 private:
  /** message associated with this exception */
  std::string msg_;

  /** append call stack info to this exception's message for ease of debug */
  // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html
  void collect_call_stack() noexcept
  {
#ifdef __GNUC__
    constexpr int kMaxStackDepth = 64;
    void* stack[kMaxStackDepth];  // NOLINT
    auto depth = backtrace(stack, kMaxStackDepth);
    std::ostringstream oss;
    oss << std::endl << "Obtained " << depth << " stack frames" << std::endl;
    char** strings = backtrace_symbols(stack, depth);
    if (strings == nullptr) {
      oss << "But no stack trace could be found!" << std::endl;
      msg_ += oss.str();
      return;
    }
    ///@todo: support for demangling of C++ symbol names
    for (int i = 0; i < depth; ++i) {
      oss << "#" << i << " in " << strings[i] << std::endl;
    }
    free(strings);
    msg_ += oss.str();
#endif  // __GNUC__
  }
};

/**
 * @brief Exception thrown when logical precondition is violated.
 *
 * This exception should not be thrown directly and is instead thrown by the
 * RAFT_EXPECTS and RAFT_FAIL macros.
 *
 */
struct logic_error : public raft::exception {
  explicit logic_error(char const* const message) : raft::exception(message) {}
  explicit logic_error(std::string const& message) : raft::exception(message) {}
};

}  // namespace raft

// FIXME: Need to be replaced with RAFT_FAIL
/** macro to throw a runtime error */
// FIX: "occured" -> "occurred" in the emitted error messages.
// NOTE: `std::make_unique<char[]>` template args reconstructed (lost in extraction);
// buf.get() is used as a char* below, so char[] is the only consistent type.
#define THROW(fmt, ...)                                                                         \
  do {                                                                                          \
    int size1 =                                                                                 \
      std::snprintf(nullptr, 0, "exception occurred! file=%s line=%d: ", __FILE__, __LINE__);   \
    int size2 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__);                                  \
    if (size1 < 0 || size2 < 0)                                                                 \
      throw raft::exception("Error in snprintf, cannot handle raft exception.");                \
    auto size = size1 + size2 + 1; /* +1 for final '\0' */                                      \
    auto buf  = std::make_unique<char[]>(size_t(size));                                         \
    std::snprintf(buf.get(),                                                                    \
                  size1 + 1 /* +1 for '\0' */,                                                  \
                  "exception occurred! file=%s line=%d: ",                                      \
                  __FILE__,                                                                     \
                  __LINE__);                                                                    \
    std::snprintf(buf.get() + size1, size2 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__);          \
    std::string msg(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */             \
    throw raft::exception(msg);                                                                 \
  } while (0)

// FIXME: Need to be replaced with RAFT_EXPECTS
/** macro to check for a conditional and assert on failure */
#define ASSERT(check, fmt, ...)                  \
  do {                                           \
    if (!(check)) THROW(fmt, ##__VA_ARGS__);     \
  } while (0)

/**
 * Macro to append error message to first argument.
 * This should only be called in contexts where it is OK to throw exceptions!
 */
#define SET_ERROR_MSG(msg, location_prefix, fmt, ...)                                           \
  do {                                                                                          \
    int size1 = std::snprintf(nullptr, 0, "%s", location_prefix);                               \
    int size2 = std::snprintf(nullptr, 0, "file=%s line=%d: ", __FILE__, __LINE__);             \
    int size3 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__);                                  \
    if (size1 < 0 || size2 < 0 || size3 < 0)                                                    \
      throw raft::exception("Error in snprintf, cannot handle raft exception.");                \
    auto size = size1 + size2 + size3 + 1; /* +1 for final '\0' */                              \
    auto buf  = std::make_unique<char[]>(size_t(size));                                         \
    std::snprintf(buf.get(), size1 + 1 /* +1 for '\0' */, "%s", location_prefix);               \
    std::snprintf(                                                                              \
      buf.get() + size1, size2 + 1 /* +1 for '\0' */, "file=%s line=%d: ", __FILE__, __LINE__); \
    std::snprintf(buf.get() + size1 + size2, size3 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__);  \
    msg += std::string(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */          \
  } while (0)

/**
 * @brief Macro for checking (pre-)conditions that throws an exception when a condition is false
 *
 * @param[in] cond Expression that evaluates to true or false
 * @param[in] fmt String literal description of the reason that cond is expected to be true with
 * optional format tags
 * @throw raft::logic_error if the condition evaluates to false.
 */
#define RAFT_EXPECTS(cond, fmt, ...)                              \
  do {                                                            \
    if (!(cond)) {                                                \
      std::string msg{};                                          \
      SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \
      throw raft::logic_error(msg);                               \
    }                                                             \
  } while (0)

/**
 * @brief Indicates that an erroneous code path has been taken.
 *
 * @param[in] fmt String literal description of the reason that this code path is erroneous with
 * optional format tags
 * @throw always throws raft::logic_error
 */
#define RAFT_FAIL(fmt, ...)                                     \
  do {                                                          \
    std::string msg{};                                          \
    SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \
    throw raft::logic_error(msg);                               \
  } while (0)

#endif
+ */ + +#ifndef __RAFT_RT_HANDLE +#define __RAFT_RT_HANDLE + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +///@todo: enable once we have migrated cuml-comms layer too +//#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + +/** + * @brief Main handle object that stores all necessary context used for calling + * necessary cuda kernels and/or libraries + */ +class handle_t { + public: + // delete copy/move constructors and assignment operators as + // copying and moving underlying resources is unsafe + handle_t(const handle_t&) = delete; + handle_t& operator=(const handle_t&) = delete; + handle_t(handle_t&&) = delete; + handle_t& operator=(handle_t&&) = delete; + + /** + * @brief Construct a handle with a stream view and stream pool + * + * @param[in] stream_view the default stream (which has the default per-thread stream if + * unspecified) + * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) + */ + handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, + std::shared_ptr stream_pool = {nullptr}) + : dev_id_([]() -> int { + int cur_dev = -1; + RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); + return cur_dev; + }()), + stream_view_{stream_view}, + stream_pool_{stream_pool} + { + create_resources(); + } + + /** Destroys all held-up resources */ + virtual ~handle_t() { destroy_resources(); } + + int get_device() const { return dev_id_; } + + cublasHandle_t get_cublas_handle() const + { + std::lock_guard _(mutex_); + if (!cublas_initialized_) { + RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); + RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); + cublas_initialized_ = true; + } + return cublas_handle_; + } + + cusolverDnHandle_t get_cusolver_dn_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_dn_initialized_) { + 
RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); + cusolver_dn_initialized_ = true; + } + return cusolver_dn_handle_; + } + + cusolverSpHandle_t get_cusolver_sp_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); + cusolver_sp_initialized_ = true; + } + return cusolver_sp_handle_; + } + + cusparseHandle_t get_cusparse_handle() const + { + std::lock_guard _(mutex_); + if (!cusparse_initialized_) { + RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); + cusparse_initialized_ = true; + } + return cusparse_handle_; + } + + rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } + + /** + * @brief synchronize a stream on the handle + */ + void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } + + /** + * @brief synchronize main stream on the handle + */ + void sync_stream() const { sync_stream(stream_view_); } + + /** + * @brief returns main stream on the handle + */ + rmm::cuda_stream_view get_stream() const { return stream_view_; } + + /** + * @brief returns whether stream pool was initialized on the handle + */ + + bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } + + /** + * @brief returns stream pool on the handle + */ + const rmm::cuda_stream_pool& get_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return *stream_pool_; + } + + std::size_t get_stream_pool_size() const + { + return is_stream_pool_initialized() ? 
stream_pool_->get_pool_size() : 0; + } + + /** + * @brief return stream from pool + */ + rmm::cuda_stream_view get_stream_from_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(); + } + + /** + * @brief return stream from pool at index + */ + rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(stream_idx); + } + + /** + * @brief return stream from pool if size > 0, else main stream on handle + */ + rmm::cuda_stream_view get_next_usable_stream() const + { + return is_stream_pool_initialized() ? get_stream_from_stream_pool() : stream_view_; + } + + /** + * @brief return stream from pool at index if size > 0, else main stream on handle + * + * @param[in] stream_idx the required index of the stream in the stream pool if available + */ + rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const + { + return is_stream_pool_initialized() ? 
get_stream_from_stream_pool(stream_idx) : stream_view_; + } + + /** + * @brief synchronize the stream pool on the handle + */ + void sync_stream_pool() const + { + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + sync_stream(stream_pool_->get_stream(i)); + } + } + + /** + * @brief synchronize subset of stream pool + * + * @param[in] stream_indices the indices of the streams in the stream pool to synchronize + */ + void sync_stream_pool(const std::vector stream_indices) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + for (const auto& stream_index : stream_indices) { + sync_stream(stream_pool_->get_stream(stream_index)); + } + } + + /** + * @brief ask stream pool to wait on last event in main stream + */ + void wait_stream_pool_on_stream() const + { + RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); + } + } + + void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } + + const comms::comms_t& get_comms() const + { + RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); + return *communicator_; + } + + void set_subcomm(std::string key, std::shared_ptr subcomm) + { + subcomms_[key] = subcomm; + } + + const comms::comms_t& get_subcomm(std::string key) const + { + RAFT_EXPECTS( + subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); + + auto subcomm = subcomms_.at(key); + + RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); + + return *subcomm; + } + + bool comms_initialized() const { return (nullptr != communicator_.get()); } + + const cudaDeviceProp& get_device_properties() const + { + std::lock_guard _(mutex_); + if (!device_prop_initialized_) { + RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); + device_prop_initialized_ = 
true; + } + return prop_; + } + + private: + std::shared_ptr communicator_; + std::unordered_map> subcomms_; + + const int dev_id_; + mutable cublasHandle_t cublas_handle_; + mutable bool cublas_initialized_{false}; + mutable cusolverDnHandle_t cusolver_dn_handle_; + mutable bool cusolver_dn_initialized_{false}; + mutable cusolverSpHandle_t cusolver_sp_handle_; + mutable bool cusolver_sp_initialized_{false}; + mutable cusparseHandle_t cusparse_handle_; + mutable bool cusparse_initialized_{false}; + std::unique_ptr thrust_policy_{nullptr}; + rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; + std::shared_ptr stream_pool_{nullptr}; + cudaEvent_t event_; + mutable cudaDeviceProp prop_; + mutable bool device_prop_initialized_{false}; + mutable std::mutex mutex_; + + void create_resources() + { + thrust_policy_ = std::make_unique(stream_view_); + + RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + + void destroy_resources() + { + if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } + if (cusolver_dn_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + } + if (cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + } + if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); + } +}; // class handle_t + +/** + * @brief RAII approach to synchronizing across all streams in the handle + */ +class stream_syncer { + public: + explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); } + ~stream_syncer() + { + handle_.wait_stream_pool_on_stream(); + handle_.sync_stream_pool(); + } + + stream_syncer(const stream_syncer& other) = delete; + stream_syncer& operator=(const stream_syncer& other) = delete; + + private: + const handle_t& handle_; +}; // class stream_syncer + +} // namespace raft + +#endif \ No 
newline at end of file diff --git a/cpp/include/raft/core/interruptible.hpp b/cpp/include/raft/core/interruptible.hpp new file mode 100644 index 0000000000..55d272739f --- /dev/null +++ b/cpp/include/raft/core/interruptible.hpp @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RAFT_RT_INTERRUPTIBLE_H +#define __RAFT_RT_INTERRUPTIBLE_H + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + +/** + * @brief Exception thrown during `interruptible::synchronize` call when it detects a request + * to cancel the work performed in this CPU thread. + */ +struct interrupted_exception : public raft::exception { + using raft::exception::exception; +}; + +/** + * @brief Cooperative-style interruptible execution. + * + * This class provides facilities for interrupting execution of a C++ thread at designated points + * in code from outside of the thread. In particular, it provides an interruptible version of the + * blocking CUDA synchronization function, that allows dropping a long-running GPU work. + * + * + * **Important:** Although CUDA synchronize calls serve as cancellation points, the interruptible + * machinery has nothing to do with CUDA streams or events. In other words, when you call `cancel`, + * it’s the CPU waiting function what is interrupted, not the GPU stream work. 
This means, when the + * `interrupted_exception` is raised, any unfinished GPU stream work continues to run. It’s the + * responsibility of the developer then to make sure the unfinished stream work does not affect the + * program in an undesirable way. + * + * + * What can happen to CUDA stream when the `synchronize` is cancelled? If you catch the + * `interrupted_exception` immediately, you can safely wait on the stream again. + * Otherwise, some of the allocated resources may be released before the active kernel finishes + * using them, which will result in writing into deallocated or reallocated memory and undefined + * behavior in general. A dead-locked kernel may never finish (or may crash if you’re lucky). In + * practice, the outcome is usually acceptable for the use case of emergency program interruption + * (e.g., CTRL+C), but extra effort on the use side is required to allow safe interrupting and + * resuming of the GPU stream work. + */ +class interruptible { + public: + /** + * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] stream a CUDA stream. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. + * @throw raft::cuda_error if another CUDA error happens. + */ + static inline void synchronize(rmm::cuda_stream_view stream) + { + get_token()->synchronize_impl(cudaStreamQuery, stream); + } + + /** + * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] event a CUDA event. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. + * @throw raft::cuda_error if another CUDA error happens. 
+ */ + static inline void synchronize(cudaEvent_t event) + { + get_token()->synchronize_impl(cudaEventQuery, event); + } + + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * This is a cancellation point for an interruptible thread. It's called in the internals of + * `interruptible::synchronize` in a loop. If two synchronize calls are far apart, it's + * recommended to call `interruptible::yield()` in between to make sure the thread does not become + * unresponsive for too long. + * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread. + */ + static inline void yield() { get_token()->yield_impl(); } + + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. + * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. + */ + static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } + + /** + * @brief Get a cancellation token for this CPU thread. + * + * @return an object that can be used to cancel the GPU work waited on this CPU thread. + */ + static inline auto get_token() -> std::shared_ptr + { + // NB: using static thread-local storage to keep the token alive once it is initialized + static thread_local std::shared_ptr s( + get_token_impl(std::this_thread::get_id())); + return s; + } + + /** + * @brief Get a cancellation token for a CPU thread given by its id. + * + * The returned token may live longer than the associated thread. In that case, using its + * `cancel` method has no effect. 
+ * + * @param [in] thread_id an id of a C++ CPU thread. + * @return an object that can be used to cancel the GPU work waited on the given CPU thread. + */ + static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr + { + return get_token_impl(thread_id); + } + + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by the `thread_id` + * + * Note, this function uses a mutex to safely get a cancellation token that may be shared + * among multiple threads. If you plan to use it from a signal handler, consider the non-static + * `cancel()` instead. + * + * @param [in] thread_id a CPU thread, in which the work should be interrupted. + */ + static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } + + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by this `interruptible` token. + * + * Note, this function does not involve thread synchronization/locks and does not throw any + * exceptions, so it's safe to call from a signal handler. + */ + inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } + + // don't allow the token to leave the shared_ptr + interruptible(interruptible const&) = delete; + interruptible(interruptible&&) = delete; + auto operator=(interruptible const&) -> interruptible& = delete; + auto operator=(interruptible&&) -> interruptible& = delete; + + private: + /** Global registry of thread-local cancellation stores. */ + static inline std::unordered_map> registry_; + /** Protect the access to the registry. */ + static inline std::mutex mutex_; + + /** + * Create a new interruptible token or get an existing from the global registry_. + * + * Presumptions: + * + * 1. get_token_impl must be called at most once per thread. + * 2. When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). + * 3. 
get_token_impl can be called as many times as needed, producing a valid + * token for any input thread_id, independent of whether a C++ thread with this + * id exists or not. + * + * @tparam Claim whether to bind the token to the given thread. + * @param [in] thread_id the id of the associated C++ thread. + * @return new or existing interruptible token. + */ + template + static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr + { + std::lock_guard guard_get(mutex_); + // the following constructs an empty shared_ptr if the key does not exist. + auto& weak_store = registry_[thread_id]; + auto thread_store = weak_store.lock(); + if (!thread_store || (Claim && thread_store->claimed_)) { + // Create a new thread_store in two cases: + // 1. It does not exist in the map yet + // 2. The previous store in the map has not yet been deleted + thread_store.reset(new interruptible(), [thread_id](auto ts) { + std::lock_guard guard_erase(mutex_); + auto found = registry_.find(thread_id); + if (found != registry_.end()) { + auto stored = found->second.lock(); + // thread_store is not moveable, thus retains its original location. + // Not equal pointers below imply the new store has been already placed + // in the registry_ by the same std::thread::id + if (!stored || stored.get() == ts) { registry_.erase(found); } + } + delete ts; + }); + std::weak_ptr(thread_store).swap(weak_store); + } + // The thread_store is "claimed" by the thread + if constexpr (Claim) { thread_store->claimed_ = true; } + return thread_store; + } + + /** + * Communicate whether the thread is in a cancelled state or can continue execution. + * + * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. + * These are the only two places where it's used. + */ + std::atomic_flag continue_; + /** This flag is set to true when the created token is placed into a thread-local storage. 
*/ + bool claimed_ = false; + + interruptible() noexcept { yield_no_throw_impl(); } + + void yield_impl() + { + if (!yield_no_throw_impl()) { + throw interrupted_exception("The work in this thread was cancelled."); + } + } + + auto yield_no_throw_impl() noexcept -> bool + { + return continue_.test_and_set(std::memory_order_relaxed); + } + + template + inline void synchronize_impl(Query query, Object object) + { + cudaError_t query_result; + while (true) { + yield_impl(); + query_result = query(object); + if (query_result != cudaErrorNotReady) { break; } + std::this_thread::yield(); + } + RAFT_CUDA_TRY(query_result); + } +}; + +} // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/core/logger.hpp b/cpp/include/raft/core/logger.hpp new file mode 100644 index 0000000000..9066e103d0 --- /dev/null +++ b/cpp/include/raft/core/logger.hpp @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include + +#include + +#define SPDLOG_HEADER_ONLY +#include +#include // NOLINT +#include // NOLINT + +/** + * @defgroup logging levels used in raft + * + * @note exactly match the corresponding ones (but reverse in terms of value) + * in spdlog for wrapping purposes + * + * @{ + */ +#define RAFT_LEVEL_TRACE 6 +#define RAFT_LEVEL_DEBUG 5 +#define RAFT_LEVEL_INFO 4 +#define RAFT_LEVEL_WARN 3 +#define RAFT_LEVEL_ERROR 2 +#define RAFT_LEVEL_CRITICAL 1 +#define RAFT_LEVEL_OFF 0 +/** @} */ + +#if !defined(RAFT_ACTIVE_LEVEL) +#define RAFT_ACTIVE_LEVEL RAFT_LEVEL_DEBUG +#endif + +namespace raft { + +static const std::string RAFT_NAME = "raft"; +static const std::string default_log_pattern("[%L] [%H:%M:%S.%f] %v"); + +/** + * @defgroup CStringFormat Expand a C-style format string + * + * @brief Expands C-style formatted string into std::string + * + * @param[in] fmt format string + * @param[in] vl respective values for each of format modifiers in the string + * + * @return the expanded `std::string` + * + * @{ + */ +std::string format(const char* fmt, va_list& vl) +{ + char buf[4096]; + vsnprintf(buf, sizeof(buf), fmt, vl); + return std::string(buf); +} + +std::string format(const char* fmt, ...) +{ + va_list vl; + va_start(vl, fmt); + std::string str = format(fmt, vl); + va_end(vl); + return str; +} +/** @} */ + +int convert_level_to_spdlog(int level) +{ + level = std::max(RAFT_LEVEL_OFF, std::min(RAFT_LEVEL_TRACE, level)); + return RAFT_LEVEL_TRACE - level; +} + +/** + * @brief The main Logging class for raft library. + * + * This class acts as a thin wrapper over the underlying `spdlog` interface. The + * design is done in this way in order to avoid us having to also ship `spdlog` + * header files in our installation. + * + * @todo This currently only supports logging to stdout. 
Need to add support in + * future to add custom loggers as well [Issue #2046] + */ +class logger { + public: + // @todo setting the logger once per process with + logger(std::string const& name_ = "") + : sink{std::make_shared()}, + spdlogger{std::make_shared(name_, sink)}, + cur_pattern() + { + set_pattern(default_log_pattern); + set_level(RAFT_LEVEL_INFO); + } + /** + * @brief Singleton method to get the underlying logger object + * + * @return the singleton logger object + */ + static logger& get(std::string const& name = "") + { + if (log_map.find(name) == log_map.end()) { + log_map[name] = std::make_shared(name); + } + return *log_map[name]; + } + + /** + * @brief Set the logging level. + * + * Only messages with level equal or above this will be printed + * + * @param[in] level logging level + * + * @note The log level will actually be set only if the input is within the + * range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll + * be ignored. See documentation of decisiontree for how this gets used + */ + void set_level(int level) + { + level = convert_level_to_spdlog(level); + spdlogger->set_level(static_cast(level)); + } + + /** + * @brief Set the logging pattern + * + * @param[in] pattern the pattern to be set. 
Refer this link + * https://github.com/gabime/spdlog/wiki/3.-Custom-formatting + * to know the right syntax of this pattern + */ + void set_pattern(const std::string& pattern) + { + cur_pattern = pattern; + spdlogger->set_pattern(pattern); + } + + /** + * @brief Register a callback function to be run in place of usual log call + * + * @param[in] callback the function to be run on all logged messages + */ + void set_callback(void (*callback)(int lvl, const char* msg)) { sink->set_callback(callback); } + + /** + * @brief Register a flush function compatible with the registered callback + * + * @param[in] flush the function to use when flushing logs + */ + void set_flush(void (*flush)()) { sink->set_flush(flush); } + + /** + * @brief Tells whether messages will be logged for the given log level + * + * @param[in] level log level to be checked for + * @return true if messages will be logged for this level, else false + */ + bool should_log_for(int level) const + { + level = convert_level_to_spdlog(level); + auto level_e = static_cast(level); + return spdlogger->should_log(level_e); + } + + /** + * @brief Query for the current log level + * + * @return the current log level + */ + int get_level() const + { + auto level_e = spdlogger->level(); + return RAFT_LEVEL_TRACE - static_cast(level_e); + } + + /** + * @brief Get the current logging pattern + * @return the pattern + */ + std::string get_pattern() const { return cur_pattern; } + + /** + * @brief Main logging method + * + * @param[in] level logging level of this message + * @param[in] fmt C-like format string, followed by respective params + */ + void log(int level, const char* fmt, ...) 
+ { + level = convert_level_to_spdlog(level); + auto level_e = static_cast(level); + // explicit check to make sure that we only expand messages when required + if (spdlogger->should_log(level_e)) { + va_list vl; + va_start(vl, fmt); + auto msg = format(fmt, vl); + va_end(vl); + spdlogger->log(level_e, msg); + } + } + + /** + * @brief Flush logs by calling flush on underlying logger + */ + void flush() { spdlogger->flush(); } + + ~logger() {} + + private: + logger(); + + static inline std::unordered_map> log_map; + std::shared_ptr sink; + std::shared_ptr spdlogger; + std::string cur_pattern; + int cur_level; +}; // class logger + +}; // namespace raft + +/** + * @defgroup loggerMacros Helper macros for dealing with logging + * @{ + */ +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE) +#define RAFT_LOG_TRACE(fmt, ...) \ + do { \ + std::stringstream ss; \ + ss << raft::detail::format("%s:%d ", __FILE__, __LINE__); \ + ss << raft::detail::format(fmt, ##__VA_ARGS__); \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \ + } while (0) +#else +#define RAFT_LOG_TRACE(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) +#define RAFT_LOG_DEBUG(fmt, ...) \ + do { \ + std::stringstream ss; \ + ss << raft::format("%s:%d ", __FILE__, __LINE__); \ + ss << raft::format(fmt, ##__VA_ARGS__); \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_DEBUG, ss.str().c_str()); \ + } while (0) +#else +#define RAFT_LOG_DEBUG(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_INFO) +#define RAFT_LOG_INFO(fmt, ...) \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_INFO, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_INFO(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_WARN) +#define RAFT_LOG_WARN(fmt, ...) \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_WARN, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_WARN(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_ERROR) +#define RAFT_LOG_ERROR(fmt, ...) 
\ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_ERROR, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_ERROR(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_CRITICAL) +#define RAFT_LOG_CRITICAL(fmt, ...) \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_CRITICAL, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_CRITICAL(fmt, ...) void(0) +#endif +/** @} */ diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp new file mode 100644 index 0000000000..595c0161cd --- /dev/null +++ b/cpp/include/raft/core/mdarray.hpp @@ -0,0 +1,650 @@ +/* + * Copyright (2019) Sandia Corporation + * + * The source code is licensed under the 3-clause BSD license found in the LICENSE file + * thirdparty/LICENSES/mdarray.license + */ + +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include + +namespace raft { +/** + * @\brief C-Contiguous layout for mdarray and mdspan. Implies row-major and contiguous memory. + */ +using layout_c_contiguous = detail::stdex::layout_right; + +/** + * @\brief F-Contiguous layout for mdarray and mdspan. Implies column-major and contiguous memory. + */ +using layout_f_contiguous = detail::stdex::layout_left; + +/** + * @brief stdex::mdspan with device tag to avoid accessing incorrect memory location. 
 + */ +template > +using device_mdspan = detail::stdex:: + mdspan>; + +/** + * @brief stdex::mdspan with host tag to avoid accessing incorrect memory location. + */ +template > +using host_mdspan = + detail::stdex::mdspan>; + +/** + * @brief Modified from the c++ mdarray proposal + * + * https://isocpp.org/files/papers/D1684R0.html + * + * mdarray is a container type for mdspan with similar template arguments. However there + * are some inconsistencies in between them. We have made some modifications to fit our + * needs, which are listed below. + * + * - Layout policy is different, the mdarray in raft uses `stdex::extent` directly just + * like `mdspan`, while the `mdarray` in the reference implementation uses variadic + * template. + * + * - Most of the constructors from the reference implementation are removed to make sure + * CUDA stream is honored. + * + * - unique_size is not implemented, which is still a work in progress in the proposal + * + * - For container policy, we adopt the alternative approach documented in the proposal + * [sec 2.4.3], which requires an additional make_accessor method for it to be used in + * mdspan. The container policy reference implementation has multiple `access` methods + * that accommodate needs for both mdarray and mdspan. This is more difficult for us + * since the policy might contain states that are unwanted inside a CUDA kernel. Also, + * on host we return a proxy to the actual value as `device_ref` so different access + * methods will have different return type, which is less desirable. + * + * - For the above reasons, copying from other mdarray with different policy type is also + * removed. 
+ */ +template +class mdarray { + static_assert(!std::is_const::value, + "Element type for container must not be const."); + + public: + using extents_type = Extents; + using layout_type = LayoutPolicy; + using mapping_type = typename layout_type::template mapping; + using element_type = ElementType; + + using value_type = std::remove_cv_t; + using index_type = std::size_t; + using difference_type = std::ptrdiff_t; + // Naming: ref impl: container_policy_type, proposal: container_policy + using container_policy_type = ContainerPolicy; + using container_type = typename container_policy_type::container_type; + + using pointer = typename container_policy_type::pointer; + using const_pointer = typename container_policy_type::const_pointer; + using reference = typename container_policy_type::reference; + using const_reference = typename container_policy_type::const_reference; + + private: + template , + typename container_policy_type::const_accessor_policy, + typename container_policy_type::accessor_policy>> + using view_type_impl = + std::conditional_t, + device_mdspan>; + + public: + /** + * \brief the mdspan type returned by view method. 
+ */ + using view_type = view_type_impl; + using const_view_type = view_type_impl; + + public: + constexpr mdarray() noexcept(std::is_nothrow_default_constructible_v) + : cp_{rmm::cuda_stream_default}, c_{cp_.create(0)} {}; + constexpr mdarray(mdarray const&) noexcept(std::is_nothrow_copy_constructible_v) = + default; + constexpr mdarray(mdarray&&) noexcept(std::is_nothrow_move_constructible::value) = + default; + + constexpr auto operator =(mdarray const&) noexcept( + std::is_nothrow_copy_assignable::value) -> mdarray& = default; + constexpr auto operator =(mdarray&&) noexcept( + std::is_nothrow_move_assignable::value) -> mdarray& = default; + + ~mdarray() noexcept(std::is_nothrow_destructible::value) = default; + +#ifndef RAFT_MDARRAY_CTOR_CONSTEXPR +#if !(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 2) +// 11.0: +// Error: Internal Compiler Error (codegen): "there was an error in verifying the lgenfe output!" +// +// 11.2: +// Call parameter type does not match function signature! +// i8** null +// i8* %call14 = call i32 null(void (i8*)* null, i8* null, i8** null), !dbg !1060 +// : parse Invalid record (Producer: 'LLVM7.0.1' Reader: 'LLVM 7.0.1') +#define RAFT_MDARRAY_CTOR_CONSTEXPR constexpr +#else +#define RAFT_MDARRAY_CTOR_CONSTEXPR +#endif // !(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 2) +#endif // RAFT_MDARRAY_CTOR_CONSTEXPR + + /** + * @brief The only constructor that can create storage, this is to make sure CUDA stream is being + * used. + */ + RAFT_MDARRAY_CTOR_CONSTEXPR mdarray(mapping_type const& m, container_policy_type const& cp) + : cp_(cp), map_(m), c_(cp_.create(map_.required_span_size())) + { + } + RAFT_MDARRAY_CTOR_CONSTEXPR mdarray(mapping_type const& m, container_policy_type& cp) + : cp_(cp), map_(m), c_(cp_.create(map_.required_span_size())) + { + } + +#undef RAFT_MDARRAY_CTOR_CONSTEXPR + + /** + * @brief Get a mdspan that can be passed down to CUDA kernels. 
+ */ + auto view() noexcept { return view_type(c_.data(), map_, cp_.make_accessor_policy()); } + /** + * @brief Get a mdspan that can be passed down to CUDA kernels. + */ + auto view() const noexcept + { + return const_view_type(c_.data(), map_, cp_.make_accessor_policy()); + } + + [[nodiscard]] constexpr auto size() const noexcept -> index_type { return this->view().size(); } + + [[nodiscard]] auto data() noexcept -> pointer { return c_.data(); } + [[nodiscard]] constexpr auto data() const noexcept -> const_pointer { return c_.data(); } + + /** + * @brief Indexing operator, use it sparingly since it triggers a device<->host copy. + */ + template + auto operator()(IndexType&&... indices) + -> std::enable_if_t && ...) && + std::is_constructible_v && + std::is_constructible_v, + /* device policy is not default constructible due to requirement for CUDA + stream. */ + /* std::is_default_constructible_v */ + reference> + { + return cp_.access(c_, map_(std::forward(indices)...)); + } + + /** + * @brief Indexing operator, use it sparingly since it triggers a device<->host copy. + */ + template + auto operator()(IndexType&&... indices) const + -> std::enable_if_t && ...) && + std::is_constructible_v && + std::is_constructible::value, + /* device policy is not default constructible due to requirement for CUDA + stream. 
*/ + /* std::is_default_constructible_v */ + const_reference> + { + return cp_.access(c_, map_(std::forward(indices)...)); + } + + // basic_mdarray observers of the domain multidimensional index space (also in basic_mdspan) + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank() noexcept -> index_type + { + return extents_type::rank(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank_dynamic() noexcept -> index_type + { + return extents_type::rank_dynamic(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto static_extent(size_t r) noexcept + -> index_type + { + return extents_type::static_extent(r); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extents() const noexcept -> extents_type + { + return map_.extents(); + } + /** + * @brief the extent of rank r + */ + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extent(size_t r) const noexcept -> index_type + { + return map_.extents().extent(r); + } + // mapping + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto mapping() const noexcept -> mapping_type + { + return map_; + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_unique() const noexcept -> bool + { + return map_.is_unique(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_contiguous() const noexcept -> bool + { + return map_.is_contiguous(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_strided() const noexcept -> bool + { + return map_.is_strided(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto stride(size_t r) const -> index_type + { + return map_.stride(r); + } + + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_unique() noexcept -> bool + { + return mapping_type::is_always_unique(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_contiguous() noexcept -> bool + { + return mapping_type::is_always_contiguous(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_strided() 
noexcept -> bool + { + return mapping_type::is_always_strided(); + } + + private: + template + friend class mdarray; + + private: + container_policy_type cp_; + mapping_type map_; + container_type c_; +}; + +/** + * @brief mdarray with host container policy + * @tparam ElementType the data type of the elements + * @tparam Extents defines the shape + * @tparam LayoutPolicy policy for indexing strides and layout ordering + * @tparam ContainerPolicy storage and accessor policy + */ +template > +using host_mdarray = + mdarray>; + +/** + * @brief mdarray with device container policy + * @tparam ElementType the data type of the elements + * @tparam Extents defines the shape + * @tparam LayoutPolicy policy for indexing strides and layout ordering + * @tparam ContainerPolicy storage and accessor policy + */ +template > +using device_mdarray = + mdarray>; + +/** + * @brief Shorthand for 0-dim host mdarray (scalar). + * @tparam ElementType the data type of the scalar element + */ +template +using host_scalar = host_mdarray; + +/** + * @brief Shorthand for 0-dim host mdarray (scalar). + * @tparam ElementType the data type of the scalar element + */ +template +using device_scalar = device_mdarray; + +/** + * @brief Shorthand for 1-dim host mdarray. + * @tparam ElementType the data type of the vector elements + */ +template +using host_vector = host_mdarray; + +/** + * @brief Shorthand for 1-dim device mdarray. + * @tparam ElementType the data type of the vector elements + */ +template +using device_vector = device_mdarray; + +/** + * @brief Shorthand for c-contiguous host matrix. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using host_matrix = host_mdarray; + +/** + * @brief Shorthand for c-contiguous device matrix. 
+ * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using device_matrix = device_mdarray; + +/** + * @brief Shorthand for 0-dim host mdspan (scalar). + * @tparam ElementType the data type of the scalar element + */ +template +using host_scalar_view = host_mdspan; + +/** + * @brief Shorthand for 0-dim host mdspan (scalar). + * @tparam ElementType the data type of the scalar element + */ +template +using device_scalar_view = device_mdspan; + +/** + * @brief Shorthand for 1-dim host mdspan. + * @tparam ElementType the data type of the vector elements + */ +template +using host_vector_view = host_mdspan; + +/** + * @brief Shorthand for 1-dim device mdspan. + * @tparam ElementType the data type of the vector elements + */ +template +using device_vector_view = device_mdspan; + +/** + * @brief Shorthand for c-contiguous host matrix view. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * + */ +template +using host_matrix_view = host_mdspan; + +/** + * @brief Shorthand for c-contiguous device matrix view. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * + */ +template +using device_matrix_view = device_mdspan; + +/** + * @brief Create a 0-dim (scalar) mdspan instance for host value. + * + * @tparam ElementType the data type of the matrix elements + * @param[in] ptr on device to wrap + */ +template +auto make_host_scalar_view(ElementType* ptr) +{ + detail::scalar_extent extents; + return host_scalar_view{ptr, extents}; +} + +/** + * @brief Create a 0-dim (scalar) mdspan instance for device value. 
+ * + * @tparam ElementType the data type of the matrix elements + * @param[in] ptr on device to wrap + */ +template +auto make_device_scalar_view(ElementType* ptr) +{ + detail::scalar_extent extents; + return device_scalar_view{ptr, extents}; +} + +/** + * @brief Create a 2-dim c-contiguous mdspan instance for host pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] ptr on host to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ +template +auto make_host_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols) +{ + detail::matrix_extent extents{n_rows, n_cols}; + return host_matrix_view{ptr, extents}; +} +/** + * @brief Create a 2-dim c-contiguous mdspan instance for device pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] ptr on device to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ +template +auto make_device_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols) +{ + detail::matrix_extent extents{n_rows, n_cols}; + return device_matrix_view{ptr, extents}; +} + +/** + * @brief Create a 1-dim mdspan instance for host pointer. + * @tparam ElementType the data type of the vector elements + * @param[in] ptr on host to wrap + * @param[in] n number of elements in pointer + * @return raft::host_vector_view + */ +template +auto make_host_vector_view(ElementType* ptr, size_t n) +{ + detail::vector_extent extents{n}; + return host_vector_view{ptr, extents}; +} + +/** + * @brief Create a 1-dim mdspan instance for device pointer. 
 + * @tparam ElementType the data type of the vector elements + * @param[in] ptr on device to wrap + * @param[in] n number of elements in pointer + * @return raft::device_vector_view + */ +template +auto make_device_vector_view(ElementType* ptr, size_t n) +{ + detail::vector_extent extents{n}; + return device_vector_view{ptr, extents}; +} + +/** + * @brief Create a 2-dim c-contiguous host mdarray. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] n_rows number of rows in matrix + * @param[in] n_cols number of columns in matrix + * @return raft::host_matrix + */ +template +auto make_host_matrix(size_t n_rows, size_t n_cols) +{ + detail::matrix_extent extents{n_rows, n_cols}; + using policy_t = typename host_matrix::container_policy_type; + policy_t policy; + return host_matrix{extents, policy}; +} + +/** + * @brief Create a 2-dim c-contiguous device mdarray. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] n_rows number of rows in matrix + * @param[in] n_cols number of columns in matrix + * @param[in] stream cuda stream for ordering events + * @return raft::device_matrix + */ +template +auto make_device_matrix(size_t n_rows, size_t n_cols, rmm::cuda_stream_view stream) +{ + detail::matrix_extent extents{n_rows, n_cols}; + using policy_t = typename device_matrix::container_policy_type; + policy_t policy{stream}; + return device_matrix{extents, policy}; +} + +/** + * @brief Create a 2-dim c-contiguous device mdarray. 
 + * + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] handle raft handle for managing expensive resources + * @param[in] n_rows number of rows in matrix + * @param[in] n_cols number of columns in matrix + * @return raft::device_matrix + */ +template +auto make_device_matrix(raft::handle_t const& handle, size_t n_rows, size_t n_cols) +{ + return make_device_matrix(n_rows, n_cols, handle.get_stream()); +} + +/** + * @brief Create a host scalar from v. + * + * @tparam ElementType the data type of the scalar element + * @param[in] v scalar type to wrap + * @return raft::host_scalar + */ +template +auto make_host_scalar(ElementType const& v) +{ + // FIXME(jiamingy): We can optimize this by using std::array as container policy, which + // requires some more compile time dispatching. This is enabled in the ref impl but + // hasn't been ported here yet. + detail::scalar_extent extents; + using policy_t = typename host_scalar::container_policy_type; + policy_t policy; + auto scalar = host_scalar{extents, policy}; + scalar(0) = v; + return scalar; +} + +/** + * @brief Create a device scalar from v. + * + * @tparam ElementType the data type of the scalar element + * @param[in] v scalar type to wrap on device + * @param[in] stream the cuda stream for ordering events + * @return raft::device_scalar + */ +template +auto make_device_scalar(ElementType const& v, rmm::cuda_stream_view stream) +{ + detail::scalar_extent extents; + using policy_t = typename device_scalar::container_policy_type; + policy_t policy{stream}; + auto scalar = device_scalar{extents, policy}; + scalar(0) = v; + return scalar; +} + +/** + * @brief Create a device scalar from v. 
+ * + * @tparam ElementType the data type of the scalar element + * @param[in] handle raft handle for managing expensive cuda resources + * @param[in] v scalar to wrap on device + * @return raft::device_scalar + */ +template +auto make_device_scalar(raft::handle_t const& handle, ElementType const& v) +{ + return make_device_scalar(v, handle.get_stream()); +} + +/** + * @brief Create a 1-dim host mdarray. + * @tparam ElementType the data type of the vector elements + * @param[in] n number of elements in vector + * @return raft::host_vector + */ +template +auto make_host_vector(size_t n) +{ + detail::vector_extent extents{n}; + using policy_t = typename host_vector::container_policy_type; + policy_t policy; + return host_vector{extents, policy}; +} + +/** + * @brief Create a 1-dim device mdarray. + * @tparam ElementType the data type of the vector elements + * @param[in] n number of elements in vector + * @param[in] stream the cuda stream for ordering events + * @return raft::device_vector + */ +template +auto make_device_vector(size_t n, rmm::cuda_stream_view stream) +{ + detail::vector_extent extents{n}; + using policy_t = typename device_vector::container_policy_type; + policy_t policy{stream}; + return device_vector{extents, policy}; +} + +/** + * @brief Create a 1-dim device mdarray. + * @tparam ElementType the data type of the vector elements + * @param[in] handle raft handle for managing expensive cuda resources + * @param[in] n number of elements in vector + * @return raft::device_vector + */ +template +auto make_device_vector(raft::handle_t const& handle, size_t n) +{ + return make_device_vector(n, handle.get_stream()); +} +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/nvtx.hpp b/cpp/include/raft/core/nvtx.hpp new file mode 100644 index 0000000000..eb536b0e01 --- /dev/null +++ b/cpp/include/raft/core/nvtx.hpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +/** + * \section Usage + * + * To add NVTX ranges to your code, use the `nvtx::range` RAII object. A + * range begins when the object is created, and ends when the object is + * destroyed. + * + * The example below creates nested NVTX ranges. The range `fun_scope` spans + * the whole function, while the range `epoch_scope` spans an iteration + * (and appears 5 times in the timeline). + * \code{.cpp} + * #include + * void some_function(int k){ + * // Begins a NVTX range with the message "some_function_{k}" + * // The range ends when some_function() returns + * common::nvtx::range fun_scope( r{"some_function_%d", k}; + * + * for(int i = 0; i < 5; i++){ + * common::nvtx::range epoch_scope{"epoch-%d", i}; + * // some logic inside the loop + * } + * } + * \endcode + * + * \section Domains + * + * All NVTX ranges are assigned to domains. A domain defines a named timeline in + * the Nsight Systems view. By default, we put all ranges into a domain `domain::app` + * named "application". This is controlled by the template parameter `Domain`. + * + * The example below defines a domain and uses it in a function. + * \code{.cpp} + * #include + * + * struct my_app_domain { + * static constexpr char const* name{"my application"}; + * } + * + * void some_function(int k){ + * // This NVTX range appears in the timeline named "my application" in Nsight Systems. 
+ * common::nvtx::range fun_scope( r{"some_function_%d", k}; + * // some logic inside the loop + * } + * \endcode + */ +namespace raft::common::nvtx { + +namespace domain { + +/** @brief The default NVTX domain. */ +struct app { + static constexpr char const* name{"application"}; +}; + +/** @brief This NVTX domain is supposed to be used within raft. */ +struct raft { + static constexpr char const* name{"raft"}; +}; + +} // namespace domain + +/** + * @brief Push a named NVTX range. + * + * @tparam Domain optional struct that defines the NVTX domain message; + * You can create a new domain with a custom message as follows: + * \code{.cpp} + * struct custom_domain { static constexpr char const* name{"custom message"}; } + * \endcode + * NB: make sure to use the same domain for `push_range` and `pop_range`. + * @param format range name format (accepts printf-style arguments) + * @param args the arguments for the printf-style formatting + */ +template +inline void push_range(const char* format, Args... args) +{ + detail::push_range(format, args...); +} + +/** + * @brief Pop the latest range. + * + * @tparam Domain optional struct that defines the NVTX domain message; + * You can create a new domain with a custom message as follows: + * \code{.cpp} + * struct custom_domain { static constexpr char const* name{"custom message"}; } + * \endcode + * NB: make sure to use the same domain for `push_range` and `pop_range`. + */ +template +inline void pop_range() +{ + detail::pop_range(); +} + +/** + * @brief Push a named NVTX range that would be popped at the end of the object lifetime. + * + * Refer to \ref Usage for the usage examples. + * + * @tparam Domain optional struct that defines the NVTX domain message; + * You can create a new domain with a custom message as follows: + * \code{.cpp} + * struct custom_domain { static constexpr char const* name{"custom message"}; } + * \endcode + */ +template +class range { + public: + /** + * Push a named NVTX range. 
+ * At the end of the object lifetime, pop the range back. + * + * @param format range name format (accepts printf-style arguments) + * @param args the arguments for the printf-style formatting + */ + template + explicit range(const char* format, Args... args) + { + push_range(format, args...); + } + + ~range() { pop_range(); } + + /* This object is not meant to be touched. */ + range(const range&) = delete; + range(range&&) = delete; + auto operator=(const range&) -> range& = delete; + auto operator=(range&&) -> range& = delete; + static auto operator new(std::size_t) -> void* = delete; + static auto operator new[](std::size_t) -> void* = delete; +}; + +} // namespace raft::common::nvtx diff --git a/cpp/include/raft/core/span.hpp b/cpp/include/raft/core/span.hpp new file mode 100644 index 0000000000..b4fbf5b63a --- /dev/null +++ b/cpp/include/raft/core/span.hpp @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include // size_t +#include // std::byte +#include +#include +#include // __host__ __device__ +#include +#include + +namespace raft { +/** + * @brief The span class defined in ISO C++20. Iterator is defined as plain pointer and + * most of the methods have bound check on debug build. 
+ * + * @code + * rmm::device_uvector uvec(10, rmm::cuda_stream_default); + * auto view = device_span{uvec.data(), uvec.size()}; + * @endcode + */ +template +class span { + public: + using element_type = T; + using value_type = typename std::remove_cv::type; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using pointer = T*; + using const_pointer = T const*; + using reference = T&; + using const_reference = T const&; + + using iterator = pointer; + using const_iterator = const_pointer; + using reverse_iterator = thrust::reverse_iterator; + using const_reverse_iterator = thrust::reverse_iterator; + + /** + * @brief Default constructor that constructs a span with size 0 and nullptr. + */ + constexpr span() noexcept = default; + + /** + * @brief Constructs a span that is a view over the range [first, first + count); + */ + constexpr span(pointer ptr, size_type count) noexcept : storage_{ptr, count} + { + assert(!(Extent != dynamic_extent && count != Extent)); + assert(ptr || count == 0); + } + /** + * @brief Constructs a span that is a view over the range [first, last) + */ + constexpr span(pointer first, pointer last) noexcept + : span{first, static_cast(thrust::distance(first, last))} + { + } + /** + * @brief Constructs a span that is a view over the array arr. + */ + template + constexpr span(element_type (&arr)[N]) noexcept : span{&arr[0], N} + { + } + + /** + * @brief Initialize a span class from another one who's underlying type is convertible + * to element_type. 
+ */ + template ::value && + detail::is_allowed_extent_conversion_t::value>> + constexpr span(const span& other) noexcept + : span{other.data(), other.size()} + { + } + + constexpr span(span const& other) noexcept = default; + constexpr span(span&& other) noexcept = default; + + constexpr auto operator=(span const& other) noexcept -> span& = default; + constexpr auto operator=(span&& other) noexcept -> span& = default; + + constexpr auto begin() const noexcept -> iterator { return data(); } + + constexpr auto end() const noexcept -> iterator { return data() + size(); } + + constexpr auto cbegin() const noexcept -> const_iterator { return data(); } + + constexpr auto cend() const noexcept -> const_iterator { return data() + size(); } + + __host__ __device__ constexpr auto rbegin() const noexcept -> reverse_iterator + { + return reverse_iterator{end()}; + } + + __host__ __device__ constexpr auto rend() const noexcept -> reverse_iterator + { + return reverse_iterator{begin()}; + } + + __host__ __device__ constexpr auto crbegin() const noexcept -> const_reverse_iterator + { + return const_reverse_iterator{cend()}; + } + + __host__ __device__ constexpr auto crend() const noexcept -> const_reverse_iterator + { + return const_reverse_iterator{cbegin()}; + } + + // element access + constexpr auto front() const -> reference { return (*this)[0]; } + + constexpr auto back() const -> reference { return (*this)[size() - 1]; } + + template + constexpr auto operator[](Index _idx) const -> reference + { + assert(static_cast(_idx) < size()); + return data()[_idx]; + } + + constexpr auto data() const noexcept -> pointer { return storage_.data(); } + + // Observers + [[nodiscard]] constexpr auto size() const noexcept -> size_type { return storage_.size(); } + [[nodiscard]] constexpr auto size_bytes() const noexcept -> size_type + { + return size() * sizeof(T); + } + + constexpr auto empty() const noexcept { return size() == 0; } + + // Subviews + template + constexpr auto first() 
const -> span + { + assert(Count <= size()); + return {data(), Count}; + } + + constexpr auto first(std::size_t _count) const -> span + { + assert(_count <= size()); + return {data(), _count}; + } + + template + constexpr auto last() const -> span + { + assert(Count <= size()); + return {data() + size() - Count, Count}; + } + + constexpr auto last(std::size_t _count) const -> span + { + assert(_count <= size()); + return subspan(size() - _count, _count); + } + + /*! + * If Count is std::dynamic_extent, r.size() == this->size() - Offset; + * Otherwise r.size() == Count. + */ + template + constexpr auto subspan() const + -> span::value> + { + assert((Count == dynamic_extent) ? (Offset <= size()) : (Offset + Count <= size())); + return {data() + Offset, Count == dynamic_extent ? size() - Offset : Count}; + } + + constexpr auto subspan(size_type _offset, size_type _count = dynamic_extent) const + -> span + { + assert((_count == dynamic_extent) ? (_offset <= size()) : (_offset + _count <= size())); + return {data() + _offset, _count == dynamic_extent ? size() - _offset : _count}; + } + + private: + detail::span_storage storage_; +}; + +/** + * @brief A span class for host pointer. + */ +template +using host_span = span; + +/** + * @brief A span class for device pointer. 
+ */ +template +using device_span = span; + +template +constexpr auto operator==(span l, span r) -> bool +{ + if (l.size() != r.size()) { return false; } + for (auto l_beg = l.cbegin(), r_beg = r.cbegin(); l_beg != l.cend(); ++l_beg, ++r_beg) { + if (*l_beg != *r_beg) { return false; } + } + return true; +} + +template +constexpr auto operator!=(span l, span r) +{ + return !(l == r); +} + +template +constexpr auto operator<(span l, span r) +{ + return detail::lexicographical_compare< + typename span::iterator, + typename span::iterator, + thrust::less::element_type>>( + l.begin(), l.end(), r.begin(), r.end()); +} + +template +constexpr auto operator<=(span l, span r) +{ + return !(l > r); +} + +template +constexpr auto operator>(span l, span r) +{ + return detail::lexicographical_compare< + typename span::iterator, + typename span::iterator, + thrust::greater::element_type>>( + l.begin(), l.end(), r.begin(), r.end()); +} + +template +constexpr auto operator>=(span l, span r) +{ + return !(l < r); +} + +/** + * @brief Converts a span into a view of its underlying bytes + */ +template +auto as_bytes(span s) noexcept + -> span::value> +{ + return {reinterpret_cast(s.data()), s.size_bytes()}; +} + +/** + * @brief Converts a span into a mutable view of its underlying bytes + */ +template +auto as_writable_bytes(span s) noexcept + -> span::value> +{ + return {reinterpret_cast(s.data()), s.size_bytes()}; +} +} // namespace raft diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 05fce6c0c4..b4549e11c9 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -16,410 +16,8 @@ /** * This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/cudart_utils.hpp instead. + * Please use core/cudart_utils.hpp instead. 
*/ -#ifndef __RAFT_RT_CUDART_UTILS_H -#define __RAFT_RT_CUDART_UTILS_H - #pragma once - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -///@todo: enable once logging has been enabled in raft -//#include "logger.hpp" - -namespace raft { - -/** - * @brief Exception thrown when a CUDA error is encountered. - */ -struct cuda_error : public raft::exception { - explicit cuda_error(char const* const message) : raft::exception(message) {} - explicit cuda_error(std::string const& message) : raft::exception(message) {} -}; - -} // namespace raft - -/** - * @brief Error checking macro for CUDA runtime API functions. - * - * Invokes a CUDA runtime API function call, if the call does not return - * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an - * exception detailing the CUDA error that occurred - * - */ -#define RAFT_CUDA_TRY(call) \ - do { \ - cudaError_t const status = call; \ - if (status != cudaSuccess) { \ - cudaGetLastError(); \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "CUDA error encountered at: ", \ - "call='%s', Reason=%s:%s", \ - #call, \ - cudaGetErrorName(status), \ - cudaGetErrorString(status)); \ - throw raft::cuda_error(msg); \ - } \ - } while (0) - -// FIXME: Remove after consumers rename -#ifndef CUDA_TRY -#define CUDA_TRY(call) RAFT_CUDA_TRY(call) -#endif - -/** - * @brief Debug macro to check for CUDA errors - * - * In a non-release build, this macro will synchronize the specified stream - * before error checking. In both release and non-release builds, this macro - * checks for any pending CUDA errors from previous calls. If an error is - * reported, an exception is thrown detailing the CUDA error that occurred. - * - * The intent of this macro is to provide a mechanism for synchronous and - * deterministic execution for debugging asynchronous CUDA execution. 
It should - * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an - * asynchronous kernel launch. - */ -#ifndef NDEBUG -#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); -#else -#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaPeekAtLastError()); -#endif - -// FIXME: Remove after consumers rename -#ifndef CHECK_CUDA -#define CHECK_CUDA(call) RAFT_CHECK_CUDA(call) -#endif - -/** FIXME: remove after cuml rename */ -#ifndef CUDA_CHECK -#define CUDA_CHECK(call) RAFT_CUDA_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. -// */ -#define RAFT_CUDA_TRY_NO_THROW(call) \ - do { \ - cudaError_t const status = call; \ - if (cudaSuccess != status) { \ - printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - cudaGetErrorString(status)); \ - } \ - } while (0) - -// FIXME: Remove after cuml rename -#ifndef CUDA_CHECK_NO_THROW -#define CUDA_CHECK_NO_THROW(call) RAFT_CUDA_TRY_NO_THROW(call) -#endif - -/** - * Alias to raft scope for now. - * TODO: Rename original implementations in 22.04 to fix - * https://github.com/rapidsai/raft/issues/128 - */ - -namespace raft { - -/** Helper method to get to know warp size in device code */ -__host__ __device__ constexpr inline int warp_size() { return 32; } - -__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } - -/** - * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping - * elements to threads. 
- */ -class grid_1d_thread_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param elements_per_thread Typically, a single kernel thread processes more than a single - * element; this affects the number of threads the grid must contain - */ - grid_1d_thread_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d, - size_t elements_per_thread = 1) - : block_size(num_threads_per_block), - num_blocks( - std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); - } -}; - -/** - * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping - * elements to warps. 
- */ -class grid_1d_warp_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_warp_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } -}; - -/** - * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping - * elements to blocks. 
- */ -class grid_1d_block_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_block_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } -}; - -/** - * @brief Generic copy method for all kinds of transfers - * @tparam Type data type - * @param dst destination pointer - * @param src source pointer - * @param len lenth of the src/dst buffers in terms of number of elements - * @param stream cuda stream - */ -template -void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) -{ - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); -} - -/** - * @defgroup Copy Copy methods - * These are here along with the generic 'copy' method in order to improve - * code readability using explicitly specified function names - * @{ - */ -/** performs a host to device copy */ -template -void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) -{ - copy(d_ptr, h_ptr, len, stream); -} - -/** performs a device to host copy */ -template -void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) -{ - copy(h_ptr, d_ptr, len, stream); -} - -template -void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view 
stream) -{ - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); -} -/** @} */ - -/** - * @defgroup Debug Utils for debugging host/device buffers - * @{ - */ -template -void print_host_vector(const char* variable_name, - const T* host_mem, - size_t componentsCount, - OutStream& out) -{ - out << variable_name << "=["; - for (size_t i = 0; i < componentsCount; ++i) { - if (i != 0) out << ","; - out << host_mem[i]; - } - out << "];\n"; -} - -template -void print_device_vector(const char* variable_name, - const T* devMem, - size_t componentsCount, - OutStream& out) -{ - T* host_mem = new T[componentsCount]; - CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); - print_host_vector(variable_name, host_mem, componentsCount, out); - delete[] host_mem; -} -/** @} */ - -/** helper method to get max usable shared mem per block parameter */ -inline int getSharedMemPerBlock() -{ - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int smemPerBlk; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); - return smemPerBlk; -} - -/** helper method to get multi-processor count parameter */ -inline int getMultiProcessorCount() -{ - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int mpCount; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); - return mpCount; -} - -/** helper method to convert an array on device to a string on host */ -template -std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) -{ - std::stringstream ss; - - T* arr_h = (T*)malloc(size * sizeof(T)); - update_host(arr_h, arr, size, stream); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - - ss << name << " = [ "; - for (int i = 0; i < size; i++) { - ss << std::setw(width) << arr_h[i]; - - if (i < size - 1) ss << ", "; - } - ss << " ]" << std::endl; - - free(arr_h); - - return ss.str(); -} - -/** 
this seems to be unused, but may be useful in the future */ -template -void ASSERT_DEVICE_MEM(T* ptr, std::string name) -{ - cudaPointerAttributes s_att; - cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); - - if (s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device - << ", err=" << s_err << std::endl; -} - -inline uint32_t curTimeMillis() -{ - auto now = std::chrono::high_resolution_clock::now(); - auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration).count(); -} - -/** Helper function to calculate need memory for allocate to store dense matrix. - * @param rows number of rows in matrix - * @param columns number of columns in matrix - * @return need number of items to allocate via allocate() - * @sa allocate() - */ -inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } - -/** Helper function to check alignment of pointer. - * @param ptr the pointer to check - * @param alignment to be checked for - * @return true if address in bytes is a multiple of alignment - */ -template -bool is_aligned(Type* ptr, size_t alignment) -{ - return reinterpret_cast(ptr) % alignment == 0; -} - -/** calculate greatest common divisor of two numbers - * @a integer - * @b integer - * @ return gcd of a and b - */ -template -IntType gcd(IntType a, IntType b) -{ - while (b != 0) { - IntType tmp = b; - b = a % b; - a = tmp; - } - return a; -} - -template -constexpr T lower_bound() -{ - if constexpr (std::numeric_limits::has_infinity && std::numeric_limits::is_signed) { - return -std::numeric_limits::infinity(); - } - return std::numeric_limits::lowest(); -} - -template -constexpr T upper_bound() -{ - if constexpr (std::numeric_limits::has_infinity) { return std::numeric_limits::infinity(); } - return std::numeric_limits::max(); -} - -} // namespace raft - -#endif +#include diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp 
index 0927142829..e109739781 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -16,166 +16,8 @@ /** * This file is deprecated and will be removed in release 22.06. - * Please use the include/raft_runtime/error.hpp instead. + * Please use the include/core/error.hpp instead. */ -#ifndef __RAFT_RT_ERROR -#define __RAFT_RT_ERROR - #pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace raft { - -/** base exception class for the whole of raft */ -class exception : public std::exception { - public: - /** default ctor */ - explicit exception() noexcept : std::exception(), msg_() {} - - /** copy ctor */ - exception(exception const& src) noexcept : std::exception(), msg_(src.what()) - { - collect_call_stack(); - } - - /** ctor from an input message */ - explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) - { - collect_call_stack(); - } - - /** get the message associated with this exception */ - char const* what() const noexcept override { return msg_.c_str(); } - - private: - /** message associated with this exception */ - std::string msg_; - - /** append call stack info to this exception's message for ease of debug */ - // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept - { -#ifdef __GNUC__ - constexpr int kMaxStackDepth = 64; - void* stack[kMaxStackDepth]; // NOLINT - auto depth = backtrace(stack, kMaxStackDepth); - std::ostringstream oss; - oss << std::endl << "Obtained " << depth << " stack frames" << std::endl; - char** strings = backtrace_symbols(stack, depth); - if (strings == nullptr) { - oss << "But no stack trace could be found!" 
<< std::endl; - msg_ += oss.str(); - return; - } - ///@todo: support for demangling of C++ symbol names - for (int i = 0; i < depth; ++i) { - oss << "#" << i << " in " << strings[i] << std::endl; - } - free(strings); - msg_ += oss.str(); -#endif // __GNUC__ - } -}; - -/** - * @brief Exception thrown when logical precondition is violated. - * - * This exception should not be thrown directly and is instead thrown by the - * RAFT_EXPECTS and RAFT_FAIL macros. - * - */ -struct logic_error : public raft::exception { - explicit logic_error(char const* const message) : raft::exception(message) {} - explicit logic_error(std::string const& message) : raft::exception(message) {} -}; - -} // namespace raft - -// FIXME: Need to be replaced with RAFT_FAIL -/** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - int size1 = \ - std::snprintf(nullptr, 0, "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - int size2 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__); \ - if (size1 < 0 || size2 < 0) \ - throw raft::exception("Error in snprintf, cannot handle raft exception."); \ - auto size = size1 + size2 + 1; /* +1 for final '\0' */ \ - auto buf = std::make_unique(size_t(size)); \ - std::snprintf(buf.get(), \ - size1 + 1 /* +1 for '\0' */, \ - "exception occured! file=%s line=%d: ", \ - __FILE__, \ - __LINE__); \ - std::snprintf(buf.get() + size1, size2 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__); \ - std::string msg(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */ \ - throw raft::exception(msg); \ - } while (0) - -// FIXME: Need to be replaced with RAFT_EXPECTS -/** macro to check for a conditional and assert on failure */ -#define ASSERT(check, fmt, ...) \ - do { \ - if (!(check)) THROW(fmt, ##__VA_ARGS__); \ - } while (0) - -/** - * Macro to append error message to first argument. - * This should only be called in contexts where it is OK to throw exceptions! - */ -#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) 
\ - do { \ - int size1 = std::snprintf(nullptr, 0, "%s", location_prefix); \ - int size2 = std::snprintf(nullptr, 0, "file=%s line=%d: ", __FILE__, __LINE__); \ - int size3 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__); \ - if (size1 < 0 || size2 < 0 || size3 < 0) \ - throw raft::exception("Error in snprintf, cannot handle raft exception."); \ - auto size = size1 + size2 + size3 + 1; /* +1 for final '\0' */ \ - auto buf = std::make_unique(size_t(size)); \ - std::snprintf(buf.get(), size1 + 1 /* +1 for '\0' */, "%s", location_prefix); \ - std::snprintf( \ - buf.get() + size1, size2 + 1 /* +1 for '\0' */, "file=%s line=%d: ", __FILE__, __LINE__); \ - std::snprintf(buf.get() + size1 + size2, size3 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__); \ - msg += std::string(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */ \ - } while (0) - -/** - * @brief Macro for checking (pre-)conditions that throws an exception when a condition is false - * - * @param[in] cond Expression that evaluates to true or false - * @param[in] fmt String literal description of the reason that cond is expected to be true with - * optinal format tagas - * @throw raft::logic_error if the condition evaluates to false. - */ -#define RAFT_EXPECTS(cond, fmt, ...) \ - do { \ - if (!(cond)) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \ - throw raft::logic_error(msg); \ - } \ - } while (0) - -/** - * @brief Indicates that an erroneous code path has been taken. - * - * @param[in] fmt String literal description of the reason that this code path is erroneous with - * optinal format tagas - * @throw always throws raft::logic_error - */ -#define RAFT_FAIL(fmt, ...) 
\ - do { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \ - throw raft::logic_error(msg); \ - } while (0) - -#endif +#include \ No newline at end of file diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 6f049503c8..4525af49d2 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -16,329 +16,9 @@ /** * This file is deprecated and will be removed in release 22.06. - * Please use the include/raft_runtime/handle.hpp instead. + * Please use the include/core/handle.hpp instead. */ -#ifndef __RAFT_RT_HANDLE -#define __RAFT_RT_HANDLE - #pragma once -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -///@todo: enable once we have migrated cuml-comms layer too -//#include - -#include "cudart_utils.h" - -#include -#include -#include -#include -#include -#include -#include - -namespace raft { - -/** - * @brief Main handle object that stores all necessary context used for calling - * necessary cuda kernels and/or libraries - */ -class handle_t { - public: - // delete copy/move constructors and assignment operators as - // copying and moving underlying resources is unsafe - handle_t(const handle_t&) = delete; - handle_t& operator=(const handle_t&) = delete; - handle_t(handle_t&&) = delete; - handle_t& operator=(handle_t&&) = delete; - - /** - * @brief Construct a handle with a stream view and stream pool - * - * @param[in] stream_view the default stream (which has the default per-thread stream if - * unspecified) - * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) - */ - handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, - std::shared_ptr stream_pool = {nullptr}) - : dev_id_([]() -> int { - int cur_dev = -1; - RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); - return cur_dev; - }()), - stream_view_{stream_view}, - stream_pool_{stream_pool} - { - create_resources(); - } - 
- /** Destroys all held-up resources */ - virtual ~handle_t() { destroy_resources(); } - - int get_device() const { return dev_id_; } - - cublasHandle_t get_cublas_handle() const - { - std::lock_guard _(mutex_); - if (!cublas_initialized_) { - RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); - RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); - cublas_initialized_ = true; - } - return cublas_handle_; - } - - cusolverDnHandle_t get_cusolver_dn_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); - cusolver_dn_initialized_ = true; - } - return cusolver_dn_handle_; - } - - cusolverSpHandle_t get_cusolver_sp_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); - cusolver_sp_initialized_ = true; - } - return cusolver_sp_handle_; - } - - cusparseHandle_t get_cusparse_handle() const - { - std::lock_guard _(mutex_); - if (!cusparse_initialized_) { - RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); - RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); - cusparse_initialized_ = true; - } - return cusparse_handle_; - } - - rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } - - /** - * @brief synchronize a stream on the handle - */ - void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } - - /** - * @brief synchronize main stream on the handle - */ - void sync_stream() const { sync_stream(stream_view_); } - - /** - * @brief returns main stream on the handle - */ - rmm::cuda_stream_view get_stream() const { return stream_view_; } - - /** - * @brief returns whether stream pool was 
initialized on the handle - */ - - bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } - - /** - * @brief returns stream pool on the handle - */ - const rmm::cuda_stream_pool& get_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return *stream_pool_; - } - - std::size_t get_stream_pool_size() const - { - return is_stream_pool_initialized() ? stream_pool_->get_pool_size() : 0; - } - - /** - * @brief return stream from pool - */ - rmm::cuda_stream_view get_stream_from_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(); - } - - /** - * @brief return stream from pool at index - */ - rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(stream_idx); - } - - /** - * @brief return stream from pool if size > 0, else main stream on handle - */ - rmm::cuda_stream_view get_next_usable_stream() const - { - return is_stream_pool_initialized() ? get_stream_from_stream_pool() : stream_view_; - } - - /** - * @brief return stream from pool at index if size > 0, else main stream on handle - * - * @param[in] stream_idx the required index of the stream in the stream pool if available - */ - rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const - { - return is_stream_pool_initialized() ? 
get_stream_from_stream_pool(stream_idx) : stream_view_; - } - - /** - * @brief synchronize the stream pool on the handle - */ - void sync_stream_pool() const - { - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - sync_stream(stream_pool_->get_stream(i)); - } - } - - /** - * @brief synchronize subset of stream pool - * - * @param[in] stream_indices the indices of the streams in the stream pool to synchronize - */ - void sync_stream_pool(const std::vector stream_indices) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - for (const auto& stream_index : stream_indices) { - sync_stream(stream_pool_->get_stream(stream_index)); - } - } - - /** - * @brief ask stream pool to wait on last event in main stream - */ - void wait_stream_pool_on_stream() const - { - RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); - } - } - - void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } - - const comms::comms_t& get_comms() const - { - RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); - return *communicator_; - } - - void set_subcomm(std::string key, std::shared_ptr subcomm) - { - subcomms_[key] = subcomm; - } - - const comms::comms_t& get_subcomm(std::string key) const - { - RAFT_EXPECTS( - subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); - - auto subcomm = subcomms_.at(key); - - RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); - - return *subcomm; - } - - bool comms_initialized() const { return (nullptr != communicator_.get()); } - - const cudaDeviceProp& get_device_properties() const - { - std::lock_guard _(mutex_); - if (!device_prop_initialized_) { - RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); - device_prop_initialized_ = 
true; - } - return prop_; - } - - private: - std::shared_ptr communicator_; - std::unordered_map> subcomms_; - - const int dev_id_; - mutable cublasHandle_t cublas_handle_; - mutable bool cublas_initialized_{false}; - mutable cusolverDnHandle_t cusolver_dn_handle_; - mutable bool cusolver_dn_initialized_{false}; - mutable cusolverSpHandle_t cusolver_sp_handle_; - mutable bool cusolver_sp_initialized_{false}; - mutable cusparseHandle_t cusparse_handle_; - mutable bool cusparse_initialized_{false}; - std::unique_ptr thrust_policy_{nullptr}; - rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; - std::shared_ptr stream_pool_{nullptr}; - cudaEvent_t event_; - mutable cudaDeviceProp prop_; - mutable bool device_prop_initialized_{false}; - mutable std::mutex mutex_; - - void create_resources() - { - thrust_policy_ = std::make_unique(stream_view_); - - RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - void destroy_resources() - { - if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } - if (cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); - } - if (cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); - } - if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } - RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); - } -}; // class handle_t - -/** - * @brief RAII approach to synchronizing across all streams in the handle - */ -class stream_syncer { - public: - explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); } - ~stream_syncer() - { - handle_.wait_stream_pool_on_stream(); - handle_.sync_stream_pool(); - } - - stream_syncer(const stream_syncer& other) = delete; - stream_syncer& operator=(const stream_syncer& other) = delete; - - private: - const handle_t& handle_; -}; // class stream_syncer - -} // namespace raft - -#endif 
+#include diff --git a/cpp/include/raft/interruptible.hpp b/cpp/include/raft/interruptible.hpp index 603f19ff35..3071f36531 100644 --- a/cpp/include/raft/interruptible.hpp +++ b/cpp/include/raft/interruptible.hpp @@ -16,261 +16,8 @@ /** * This file is deprecated and will be removed in release 22.06. - * Please use the include/raft_runtime/interruptible.hpp instead. + * Please use the include/core/interruptible.hpp instead. */ -#ifndef __RAFT_RT_INTERRUPTIBLE_H -#define __RAFT_RT_INTERRUPTIBLE_H - #pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace raft { - -/** - * @brief Exception thrown during `interruptible::synchronize` call when it detects a request - * to cancel the work performed in this CPU thread. - */ -struct interrupted_exception : public raft::exception { - using raft::exception::exception; -}; - -/** - * @brief Cooperative-style interruptible execution. - * - * This class provides facilities for interrupting execution of a C++ thread at designated points - * in code from outside of the thread. In particular, it provides an interruptible version of the - * blocking CUDA synchronization function, that allows dropping a long-running GPU work. - * - * - * **Important:** Although CUDA synchronize calls serve as cancellation points, the interruptible - * machinery has nothing to do with CUDA streams or events. In other words, when you call `cancel`, - * it’s the CPU waiting function what is interrupted, not the GPU stream work. This means, when the - * `interrupted_exception` is raised, any unfinished GPU stream work continues to run. It’s the - * responsibility of the developer then to make sure the unfinished stream work does not affect the - * program in an undesirable way. - * - * - * What can happen to CUDA stream when the `synchronize` is cancelled? If you catch the - * `interrupted_exception` immediately, you can safely wait on the stream again. 
- * Otherwise, some of the allocated resources may be released before the active kernel finishes - * using them, which will result in writing into deallocated or reallocated memory and undefined - * behavior in general. A dead-locked kernel may never finish (or may crash if you’re lucky). In - * practice, the outcome is usually acceptable for the use case of emergency program interruption - * (e.g., CTRL+C), but extra effort on the use side is required to allow safe interrupting and - * resuming of the GPU stream work. - */ -class interruptible { - public: - /** - * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] stream a CUDA stream. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. - * @throw raft::cuda_error if another CUDA error happens. - */ - static inline void synchronize(rmm::cuda_stream_view stream) - { - get_token()->synchronize_impl(cudaStreamQuery, stream); - } - - /** - * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] event a CUDA event. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. - * @throw raft::cuda_error if another CUDA error happens. - */ - static inline void synchronize(cudaEvent_t event) - { - get_token()->synchronize_impl(cudaEventQuery, event); - } - - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * This is a cancellation point for an interruptible thread. It's called in the internals of - * `interruptible::synchronize` in a loop. 
If two synchronize calls are far apart, it's - * recommended to call `interruptible::yield()` in between to make sure the thread does not become - * unresponsive for too long. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread. - */ - static inline void yield() { get_token()->yield_impl(); } - - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. - */ - static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } - - /** - * @brief Get a cancellation token for this CPU thread. - * - * @return an object that can be used to cancel the GPU work waited on this CPU thread. - */ - static inline auto get_token() -> std::shared_ptr - { - // NB: using static thread-local storage to keep the token alive once it is initialized - static thread_local std::shared_ptr s( - get_token_impl(std::this_thread::get_id())); - return s; - } - - /** - * @brief Get a cancellation token for a CPU thread given by its id. - * - * The returned token may live longer than the associated thread. In that case, using its - * `cancel` method has no effect. - * - * @param [in] thread_id an id of a C++ CPU thread. - * @return an object that can be used to cancel the GPU work waited on the given CPU thread. 
- */ - static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr - { - return get_token_impl(thread_id); - } - - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by the `thread_id` - * - * Note, this function uses a mutex to safely get a cancellation token that may be shared - * among multiple threads. If you plan to use it from a signal handler, consider the non-static - * `cancel()` instead. - * - * @param [in] thread_id a CPU thread, in which the work should be interrupted. - */ - static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } - - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by this `interruptible` token. - * - * Note, this function does not involve thread synchronization/locks and does not throw any - * exceptions, so it's safe to call from a signal handler. - */ - inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } - - // don't allow the token to leave the shared_ptr - interruptible(interruptible const&) = delete; - interruptible(interruptible&&) = delete; - auto operator=(interruptible const&) -> interruptible& = delete; - auto operator=(interruptible&&) -> interruptible& = delete; - - private: - /** Global registry of thread-local cancellation stores. */ - static inline std::unordered_map> registry_; - /** Protect the access to the registry. */ - static inline std::mutex mutex_; - - /** - * Create a new interruptible token or get an existing from the global registry_. - * - * Presumptions: - * - * 1. get_token_impl must be called at most once per thread. - * 2. When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). - * 3. get_token_impl can be called as many times as needed, producing a valid - * token for any input thread_id, independent of whether a C++ thread with this - * id exists or not. 
- * - * @tparam Claim whether to bind the token to the given thread. - * @param [in] thread_id the id of the associated C++ thread. - * @return new or existing interruptible token. - */ - template - static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr - { - std::lock_guard guard_get(mutex_); - // the following constructs an empty shared_ptr if the key does not exist. - auto& weak_store = registry_[thread_id]; - auto thread_store = weak_store.lock(); - if (!thread_store || (Claim && thread_store->claimed_)) { - // Create a new thread_store in two cases: - // 1. It does not exist in the map yet - // 2. The previous store in the map has not yet been deleted - thread_store.reset(new interruptible(), [thread_id](auto ts) { - std::lock_guard guard_erase(mutex_); - auto found = registry_.find(thread_id); - if (found != registry_.end()) { - auto stored = found->second.lock(); - // thread_store is not moveable, thus retains its original location. - // Not equal pointers below imply the new store has been already placed - // in the registry_ by the same std::thread::id - if (!stored || stored.get() == ts) { registry_.erase(found); } - } - delete ts; - }); - std::weak_ptr(thread_store).swap(weak_store); - } - // The thread_store is "claimed" by the thread - if constexpr (Claim) { thread_store->claimed_ = true; } - return thread_store; - } - - /** - * Communicate whether the thread is in a cancelled state or can continue execution. - * - * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. - * These are the only two places where it's used. - */ - std::atomic_flag continue_; - /** This flag is set to true when the created token is placed into a thread-local storage. 
*/ - bool claimed_ = false; - - interruptible() noexcept { yield_no_throw_impl(); } - - void yield_impl() - { - if (!yield_no_throw_impl()) { - throw interrupted_exception("The work in this thread was cancelled."); - } - } - - auto yield_no_throw_impl() noexcept -> bool - { - return continue_.test_and_set(std::memory_order_relaxed); - } - - template - inline void synchronize_impl(Query query, Object object) - { - cudaError_t query_result; - while (true) { - yield_impl(); - query_result = query(object); - if (query_result != cudaErrorNotReady) { break; } - std::this_thread::yield(); - } - RAFT_CUDA_TRY(query_result); - } -}; - -} // namespace raft - -#endif +#include diff --git a/cpp/include/raft/linalg/cublas_macros.h b/cpp/include/raft/linalg/cublas_macros.h index a321a080c8..78b31ab632 100644 --- a/cpp/include/raft/linalg/cublas_macros.h +++ b/cpp/include/raft/linalg/cublas_macros.h @@ -13,114 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - /** * This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/cublas_macros.hpp instead. + * Please use core/cublas_macros.hpp instead. */ -#ifndef __RAFT_RT_CUBLAS_MACROS_H -#define __RAFT_RT_CUBLAS_MACROS_H - #pragma once -#include -#include - -///@todo: enable this once we have logger enabled -//#include - -#include - -#define _CUBLAS_ERR_TO_STR(err) \ - case err: return #err - -namespace raft { - -/** - * @brief Exception thrown when a cuBLAS error is encountered. 
- */ -struct cublas_error : public raft::exception { - explicit cublas_error(char const* const message) : raft::exception(message) {} - explicit cublas_error(std::string const& message) : raft::exception(message) {} -}; - -namespace linalg { -namespace detail { - -inline const char* cublas_error_to_string(cublasStatus_t err) -{ - switch (err) { - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: return "CUBLAS_STATUS_UNKNOWN"; - }; -} - -} // namespace detail -} // namespace linalg -} // namespace raft - -#undef _CUBLAS_ERR_TO_STR - -/** - * @brief Error checking macro for cuBLAS runtime API functions. - * - * Invokes a cuBLAS runtime API function call, if the call does not return - * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred - */ -#define RAFT_CUBLAS_TRY(call) \ - do { \ - cublasStatus_t const status = (call); \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuBLAS error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::linalg::detail::cublas_error_to_string(status)); \ - throw raft::cublas_error(msg); \ - } \ - } while (0) - -// FIXME: Remove after consumers rename -#ifndef CUBLAS_TRY -#define CUBLAS_TRY(call) RAFT_CUBLAS_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. 
-// */ -#define RAFT_CUBLAS_TRY_NO_THROW(call) \ - do { \ - cublasStatus_t const status = call; \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - printf("CUBLAS call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - raft::linalg::detail::cublas_error_to_string(status)); \ - } \ - } while (0) - -/** FIXME: remove after cuml rename */ -#ifndef CUBLAS_CHECK -#define CUBLAS_CHECK(call) CUBLAS_TRY(call) -#endif - -/** FIXME: remove after cuml rename */ -#ifndef CUBLAS_CHECK_NO_THROW -#define CUBLAS_CHECK_NO_THROW(call) RAFT_CUBLAS_TRY_NO_THROW(call) -#endif - -#endif +#include diff --git a/cpp/include/raft/linalg/cusolver_macros.h b/cpp/include/raft/linalg/cusolver_macros.h index fa1cd3d4c7..192d0b54f5 100644 --- a/cpp/include/raft/linalg/cusolver_macros.h +++ b/cpp/include/raft/linalg/cusolver_macros.h @@ -13,110 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - /** * This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/cusolver_macros.hpp instead. + * Please use core/cusolver_macros.hpp instead. */ -#ifndef __RAFT_RT_CUSOLVER_MACROS_H -#define __RAFT_RT_CUSOLVER_MACROS_H - #pragma once -#include -#include -///@todo: enable this once logging is enabled -//#include -#include -#include - -#define _CUSOLVER_ERR_TO_STR(err) \ - case err: return #err; - -namespace raft { - -/** - * @brief Exception thrown when a cuSOLVER error is encountered. 
- */ -struct cusolver_error : public raft::exception { - explicit cusolver_error(char const* const message) : raft::exception(message) {} - explicit cusolver_error(std::string const& message) : raft::exception(message) {} -}; - -namespace linalg { - -inline const char* cusolver_error_to_string(cusolverStatus_t err) -{ - switch (err) { - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: return "CUSOLVER_STATUS_UNKNOWN"; - }; -} - -} // namespace linalg -} // namespace raft - -#undef _CUSOLVER_ERR_TO_STR - -/** - * @brief Error checking macro for cuSOLVER runtime API functions. - * - * Invokes a cuSOLVER runtime API function call, if the call does not return - * CUSolver_STATUS_SUCCESS, throws an exception detailing the cuSOLVER error that occurred - */ -#define RAFT_CUSOLVER_TRY(call) \ - do { \ - cusolverStatus_t const status = (call); \ - if (CUSOLVER_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuSOLVER error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::linalg::detail::cusolver_error_to_string(status)); \ - throw raft::cusolver_error(msg); \ - } \ - } while (0) - -// FIXME: remove after consumer rename -#ifndef CUSOLVER_TRY -#define CUSOLVER_TRY(call) RAFT_CUSOLVER_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. 
-// */ -#define RAFT_CUSOLVER_TRY_NO_THROW(call) \ - do { \ - cusolverStatus_t const status = call; \ - if (CUSOLVER_STATUS_SUCCESS != status) { \ - printf("CUSOLVER call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - raft::linalg::detail::cusolver_error_to_string(status)); \ - } \ - } while (0) - -// FIXME: remove after cuml rename -#ifndef CUSOLVER_CHECK -#define CUSOLVER_CHECK(call) CUSOLVER_TRY(call) -#endif - -#ifndef CUSOLVER_CHECK_NO_THROW -#define CUSOLVER_CHECK_NO_THROW(call) CUSOLVER_TRY_NO_THROW(call) -#endif - -#endif +#include diff --git a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp index 7f9abc324e..a55e1d6d7c 100644 --- a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp @@ -17,104 +17,11 @@ #pragma once #include +#include #include -#include -///@todo: enable this once we have logger enabled -//#include - #include - -#define _CUBLAS_ERR_TO_STR(err) \ - case err: return #err - -namespace raft { - -/** - * @brief Exception thrown when a cuBLAS error is encountered. 
- */ -struct cublas_error : public raft::exception { - explicit cublas_error(char const* const message) : raft::exception(message) {} - explicit cublas_error(std::string const& message) : raft::exception(message) {} -}; - -namespace linalg { -namespace detail { - -inline const char* cublas_error_to_string(cublasStatus_t err) -{ - switch (err) { - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: return "CUBLAS_STATUS_UNKNOWN"; - }; -} - -} // namespace detail -} // namespace linalg -} // namespace raft - -#undef _CUBLAS_ERR_TO_STR - -/** - * @brief Error checking macro for cuBLAS runtime API functions. - * - * Invokes a cuBLAS runtime API function call, if the call does not return - * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred - */ -#define RAFT_CUBLAS_TRY(call) \ - do { \ - cublasStatus_t const status = (call); \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuBLAS error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::linalg::detail::cublas_error_to_string(status)); \ - throw raft::cublas_error(msg); \ - } \ - } while (0) - -// FIXME: Remove after consumers rename -#ifndef CUBLAS_TRY -#define CUBLAS_TRY(call) RAFT_CUBLAS_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. 
-// */ -#define RAFT_CUBLAS_TRY_NO_THROW(call) \ - do { \ - cublasStatus_t const status = call; \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - printf("CUBLAS call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - raft::linalg::detail::cublas_error_to_string(status)); \ - } \ - } while (0) - -/** FIXME: remove after cuml rename */ -#ifndef CUBLAS_CHECK -#define CUBLAS_CHECK(call) CUBLAS_TRY(call) -#endif - -/** FIXME: remove after cuml rename */ -#ifndef CUBLAS_CHECK_NO_THROW -#define CUBLAS_CHECK_NO_THROW(call) RAFT_CUBLAS_TRY_NO_THROW(call) -#endif +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp index 34ec6cb673..e7da615748 100644 --- a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp @@ -18,101 +18,10 @@ #include #include -///@todo: enable this once logging is enabled -//#include +#include #include #include -#define _CUSOLVER_ERR_TO_STR(err) \ - case err: return #err; - -namespace raft { - -/** - * @brief Exception thrown when a cuSOLVER error is encountered. 
- */ -struct cusolver_error : public raft::exception { - explicit cusolver_error(char const* const message) : raft::exception(message) {} - explicit cusolver_error(std::string const& message) : raft::exception(message) {} -}; - -namespace linalg { -namespace detail { - -inline const char* cusolver_error_to_string(cusolverStatus_t err) -{ - switch (err) { - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: return "CUSOLVER_STATUS_UNKNOWN"; - }; -} - -} // namespace detail -} // namespace linalg -} // namespace raft - -#undef _CUSOLVER_ERR_TO_STR - -/** - * @brief Error checking macro for cuSOLVER runtime API functions. - * - * Invokes a cuSOLVER runtime API function call, if the call does not return - * CUSolver_STATUS_SUCCESS, throws an exception detailing the cuSOLVER error that occurred - */ -#define RAFT_CUSOLVER_TRY(call) \ - do { \ - cusolverStatus_t const status = (call); \ - if (CUSOLVER_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuSOLVER error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::linalg::detail::cusolver_error_to_string(status)); \ - throw raft::cusolver_error(msg); \ - } \ - } while (0) - -// FIXME: remove after consumer rename -#ifndef CUSOLVER_TRY -#define CUSOLVER_TRY(call) RAFT_CUSOLVER_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. 
-// */ -#define RAFT_CUSOLVER_TRY_NO_THROW(call) \ - do { \ - cusolverStatus_t const status = call; \ - if (CUSOLVER_STATUS_SUCCESS != status) { \ - printf("CUSOLVER call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - raft::linalg::detail::cusolver_error_to_string(status)); \ - } \ - } while (0) - -// FIXME: remove after cuml rename -#ifndef CUSOLVER_CHECK -#define CUSOLVER_CHECK(call) CUSOLVER_TRY(call) -#endif - -#ifndef CUSOLVER_CHECK_NO_THROW -#define CUSOLVER_CHECK_NO_THROW(call) CUSOLVER_TRY_NO_THROW(call) -#endif - namespace raft { namespace linalg { namespace detail { diff --git a/cpp/include/raft/mdarray.hpp b/cpp/include/raft/mdarray.hpp index 7c3a1a02fa..4249c35542 100644 --- a/cpp/include/raft/mdarray.hpp +++ b/cpp/include/raft/mdarray.hpp @@ -1,10 +1,3 @@ -/* - * Copyright (2019) Sandia Corporation - * - * The source code is licensed under the 3-clause BSD license found in the LICENSE file - * thirdparty/LICENSES/mdarray.license - */ - /* * Copyright (c) 2022, NVIDIA CORPORATION. * @@ -20,631 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#pragma once -#include -#include -#include -#include - -namespace raft { -/** - * @\brief C-Contiguous layout for mdarray and mdspan. Implies row-major and contiguous memory. - */ -using layout_c_contiguous = detail::stdex::layout_right; - -/** - * @\brief F-Contiguous layout for mdarray and mdspan. Implies column-major and contiguous memory. - */ -using layout_f_contiguous = detail::stdex::layout_left; - -/** - * @brief stdex::mdspan with device tag to avoid accessing incorrect memory location. - */ -template > -using device_mdspan = detail::stdex:: - mdspan>; - -/** - * @brief stdex::mdspan with host tag to avoid accessing incorrect memory location. 
- */ -template > -using host_mdspan = - detail::stdex::mdspan>; - -/** - * @brief Modified from the c++ mdarray proposal - * - * https://isocpp.org/files/papers/D1684R0.html - * - * mdarray is a container type for mdspan with similar template arguments. However there - * are some inconsistencies in between them. We have made some modificiations to fit our - * needs, which are listed below. - * - * - Layout policy is different, the mdarray in raft uses `stdex::extent` directly just - * like `mdspan`, while the `mdarray` in the reference implementation uses varidic - * template. - * - * - Most of the constructors from the reference implementation is removed to make sure - * CUDA stream is honorred. - * - * - unique_size is not implemented, which is still working in progress in the proposal - * - * - For container policy, we adopt the alternative approach documented in the proposal - * [sec 2.4.3], which requires an additional make_accessor method for it to be used in - * mdspan. The container policy reference implementation has multiple `access` methods - * that accommodate needs for both mdarray and mdspan. This is more difficult for us - * since the policy might contain states that are unwanted inside a CUDA kernel. Also, - * on host we return a proxy to the actual value as `device_ref` so different access - * methods will have different return type, which is less desirable. - * - * - For the above reasons, copying from other mdarray with different policy type is also - * removed. 
- */ -template -class mdarray { - static_assert(!std::is_const::value, - "Element type for container must not be const."); - - public: - using extents_type = Extents; - using layout_type = LayoutPolicy; - using mapping_type = typename layout_type::template mapping; - using element_type = ElementType; - - using value_type = std::remove_cv_t; - using index_type = std::size_t; - using difference_type = std::ptrdiff_t; - // Naming: ref impl: container_policy_type, proposal: container_policy - using container_policy_type = ContainerPolicy; - using container_type = typename container_policy_type::container_type; - - using pointer = typename container_policy_type::pointer; - using const_pointer = typename container_policy_type::const_pointer; - using reference = typename container_policy_type::reference; - using const_reference = typename container_policy_type::const_reference; - - private: - template , - typename container_policy_type::const_accessor_policy, - typename container_policy_type::accessor_policy>> - using view_type_impl = - std::conditional_t, - device_mdspan>; - - public: - /** - * \brief the mdspan type returned by view method. 
- */ - using view_type = view_type_impl; - using const_view_type = view_type_impl; - - public: - constexpr mdarray() noexcept(std::is_nothrow_default_constructible_v) - : cp_{rmm::cuda_stream_default}, c_{cp_.create(0)} {}; - constexpr mdarray(mdarray const&) noexcept(std::is_nothrow_copy_constructible_v) = - default; - constexpr mdarray(mdarray&&) noexcept(std::is_nothrow_move_constructible::value) = - default; - - constexpr auto operator =(mdarray const&) noexcept( - std::is_nothrow_copy_assignable::value) -> mdarray& = default; - constexpr auto operator =(mdarray&&) noexcept( - std::is_nothrow_move_assignable::value) -> mdarray& = default; - - ~mdarray() noexcept(std::is_nothrow_destructible::value) = default; - -#ifndef RAFT_MDARRAY_CTOR_CONSTEXPR -#if !(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 2) -// 11.0: -// Error: Internal Compiler Error (codegen): "there was an error in verifying the lgenfe output!" -// -// 11.2: -// Call parameter type does not match function signature! -// i8** null -// i8* %call14 = call i32 null(void (i8*)* null, i8* null, i8** null), !dbg !1060 -// : parse Invalid record (Producer: 'LLVM7.0.1' Reader: 'LLVM 7.0.1') -#define RAFT_MDARRAY_CTOR_CONSTEXPR constexpr -#else -#define RAFT_MDARRAY_CTOR_CONSTEXPR -#endif // !(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 2) -#endif // RAFT_MDARRAY_CTOR_CONSTEXPR - - /** - * @brief The only constructor that can create storage, this is to make sure CUDA stream is being - * used. - */ - RAFT_MDARRAY_CTOR_CONSTEXPR mdarray(mapping_type const& m, container_policy_type const& cp) - : cp_(cp), map_(m), c_(cp_.create(map_.required_span_size())) - { - } - RAFT_MDARRAY_CTOR_CONSTEXPR mdarray(mapping_type const& m, container_policy_type& cp) - : cp_(cp), map_(m), c_(cp_.create(map_.required_span_size())) - { - } - -#undef RAFT_MDARRAY_CTOR_CONSTEXPR - - /** - * @brief Get a mdspan that can be passed down to CUDA kernels. 
- */ - auto view() noexcept { return view_type(c_.data(), map_, cp_.make_accessor_policy()); } - /** - * @brief Get a mdspan that can be passed down to CUDA kernels. - */ - auto view() const noexcept - { - return const_view_type(c_.data(), map_, cp_.make_accessor_policy()); - } - - [[nodiscard]] constexpr auto size() const noexcept -> index_type { return this->view().size(); } - - [[nodiscard]] auto data() noexcept -> pointer { return c_.data(); } - [[nodiscard]] constexpr auto data() const noexcept -> const_pointer { return c_.data(); } - - /** - * @brief Indexing operator, use it sparingly since it triggers a device<->host copy. - */ - template - auto operator()(IndexType&&... indices) - -> std::enable_if_t && ...) && - std::is_constructible_v && - std::is_constructible_v, - /* device policy is not default constructible due to requirement for CUDA - stream. */ - /* std::is_default_constructible_v */ - reference> - { - return cp_.access(c_, map_(std::forward(indices)...)); - } - - /** - * @brief Indexing operator, use it sparingly since it triggers a device<->host copy. - */ - template - auto operator()(IndexType&&... indices) const - -> std::enable_if_t && ...) && - std::is_constructible_v && - std::is_constructible::value, - /* device policy is not default constructible due to requirement for CUDA - stream. 
*/ - /* std::is_default_constructible_v */ - const_reference> - { - return cp_.access(c_, map_(std::forward(indices)...)); - } - - // basic_mdarray observers of the domain multidimensional index space (also in basic_mdspan) - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank() noexcept -> index_type - { - return extents_type::rank(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank_dynamic() noexcept -> index_type - { - return extents_type::rank_dynamic(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto static_extent(size_t r) noexcept - -> index_type - { - return extents_type::static_extent(r); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extents() const noexcept -> extents_type - { - return map_.extents(); - } - /** - * @brief the extent of rank r - */ - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extent(size_t r) const noexcept -> index_type - { - return map_.extents().extent(r); - } - // mapping - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto mapping() const noexcept -> mapping_type - { - return map_; - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_unique() const noexcept -> bool - { - return map_.is_unique(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_contiguous() const noexcept -> bool - { - return map_.is_contiguous(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_strided() const noexcept -> bool - { - return map_.is_strided(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto stride(size_t r) const -> index_type - { - return map_.stride(r); - } - - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_unique() noexcept -> bool - { - return mapping_type::is_always_unique(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_contiguous() noexcept -> bool - { - return mapping_type::is_always_contiguous(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_strided() 
noexcept -> bool - { - return mapping_type::is_always_strided(); - } - - private: - template - friend class mdarray; - - private: - container_policy_type cp_; - mapping_type map_; - container_type c_; -}; - -/** - * @brief mdarray with host container policy - * @tparam ElementType the data type of the elements - * @tparam Extents defines the shape - * @tparam LayoutPolicy policy for indexing strides and layout ordering - * @tparam ContainerPolicy storage and accessor policy - */ -template > -using host_mdarray = - mdarray>; - -/** - * @brief mdarray with device container policy - * @tparam ElementType the data type of the elements - * @tparam Extents defines the shape - * @tparam LayoutPolicy policy for indexing strides and layout ordering - * @tparam ContainerPolicy storage and accessor policy - */ -template > -using device_mdarray = - mdarray>; - -/** - * @brief Shorthand for 0-dim host mdarray (scalar). - * @tparam ElementType the data type of the scalar element - */ -template -using host_scalar = host_mdarray; - -/** - * @brief Shorthand for 0-dim host mdarray (scalar). - * @tparam ElementType the data type of the scalar element - */ -template -using device_scalar = device_mdarray; - -/** - * @brief Shorthand for 1-dim host mdarray. - * @tparam ElementType the data type of the vector elements - */ -template -using host_vector = host_mdarray; - -/** - * @brief Shorthand for 1-dim device mdarray. - * @tparam ElementType the data type of the vector elements - */ -template -using device_vector = device_mdarray; - -/** - * @brief Shorthand for c-contiguous host matrix. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - */ -template -using host_matrix = host_mdarray; - -/** - * @brief Shorthand for c-contiguous device matrix. 
- * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - */ -template -using device_matrix = device_mdarray; - -/** - * @brief Shorthand for 0-dim host mdspan (scalar). - * @tparam ElementType the data type of the scalar element - */ -template -using host_scalar_view = host_mdspan; - -/** - * @brief Shorthand for 0-dim host mdspan (scalar). - * @tparam ElementType the data type of the scalar element - */ -template -using device_scalar_view = device_mdspan; - -/** - * @brief Shorthand for 1-dim host mdspan. - * @tparam ElementType the data type of the vector elements - */ -template -using host_vector_view = host_mdspan; - -/** - * @brief Shorthand for 1-dim device mdspan. - * @tparam ElementType the data type of the vector elements - */ -template -using device_vector_view = device_mdspan; - -/** - * @brief Shorthand for c-contiguous host matrix view. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * - */ -template -using host_matrix_view = host_mdspan; - -/** - * @brief Shorthand for c-contiguous device matrix view. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * - */ -template -using device_matrix_view = device_mdspan; - -/** - * @brief Create a 0-dim (scalar) mdspan instance for host value. - * - * @tparam ElementType the data type of the matrix elements - * @param[in] ptr on device to wrap - */ -template -auto make_host_scalar_view(ElementType* ptr) -{ - detail::scalar_extent extents; - return host_scalar_view{ptr, extents}; -} - -/** - * @brief Create a 0-dim (scalar) mdspan instance for device value. 
- * - * @tparam ElementType the data type of the matrix elements - * @param[in] ptr on device to wrap - */ -template -auto make_device_scalar_view(ElementType* ptr) -{ - detail::scalar_extent extents; - return device_scalar_view{ptr, extents}; -} - -/** - * @brief Create a 2-dim c-contiguous mdspan instance for host pointer. It's - * expected that the given layout policy match the layout of the underlying - * pointer. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] ptr on host to wrap - * @param[in] n_rows number of rows in pointer - * @param[in] n_cols number of columns in pointer - */ -template -auto make_host_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols) -{ - detail::matrix_extent extents{n_rows, n_cols}; - return host_matrix_view{ptr, extents}; -} -/** - * @brief Create a 2-dim c-contiguous mdspan instance for device pointer. It's - * expected that the given layout policy match the layout of the underlying - * pointer. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] ptr on device to wrap - * @param[in] n_rows number of rows in pointer - * @param[in] n_cols number of columns in pointer - */ -template -auto make_device_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols) -{ - detail::matrix_extent extents{n_rows, n_cols}; - return device_matrix_view{ptr, extents}; -} /** - * @brief Create a 1-dim mdspan instance for host pointer. - * @tparam ElementType the data type of the vector elements - * @param[in] ptr on host to wrap - * @param[in] n number of elements in pointer - * @return raft::host_vector_view + * This file is deprecated and will be removed in release 22.06. + * Please use include/core/mdarray.hpp instead. 
*/ -template -auto make_host_vector_view(ElementType* ptr, size_t n) -{ - detail::vector_extent extents{n}; - return host_vector_view{ptr, extents}; -} -/** - * @brief Create a 1-dim mdspan instance for device pointer. - * @tparam ElementType the data type of the vector elements - * @param[in] ptr on device to wrap - * @param[in] n number of elements in pointer - * @return raft::device_vector_view - */ -template -auto make_device_vector_view(ElementType* ptr, size_t n) -{ - detail::vector_extent extents{n}; - return device_vector_view{ptr, extents}; -} - -/** - * @brief Create a 2-dim c-contiguous host mdarray. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] n_rows number or rows in matrix - * @param[in] n_cols number of columns in matrix - * @return raft::host_matrix - */ -template -auto make_host_matrix(size_t n_rows, size_t n_cols) -{ - detail::matrix_extent extents{n_rows, n_cols}; - using policy_t = typename host_matrix::container_policy_type; - policy_t policy; - return host_matrix{extents, policy}; -} - -/** - * @brief Create a 2-dim c-contiguous device mdarray. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] n_rows number or rows in matrix - * @param[in] n_cols number of columns in matrix - * @param[in] stream cuda stream for ordering events - * @return raft::device_matrix - */ -template -auto make_device_matrix(size_t n_rows, size_t n_cols, rmm::cuda_stream_view stream) -{ - detail::matrix_extent extents{n_rows, n_cols}; - using policy_t = typename device_matrix::container_policy_type; - policy_t policy{stream}; - return device_matrix{extents, policy}; -} - -/** - * @brief Create a 2-dim c-contiguous device mdarray. 
- * - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] handle raft handle for managing expensive resources - * @param[in] n_rows number or rows in matrix - * @param[in] n_cols number of columns in matrix - * @return raft::device_matrix - */ -template -auto make_device_matrix(raft::handle_t const& handle, size_t n_rows, size_t n_cols) -{ - return make_device_matrix(n_rows, n_cols, handle.get_stream()); -} - -/** - * @brief Create a host scalar from v. - * - * @tparam ElementType the data type of the scalar element - * @param[in] v scalar type to wrap - * @return raft::host_scalar - */ -template -auto make_host_scalar(ElementType const& v) -{ - // FIXME(jiamingy): We can optimize this by using std::array as container policy, which - // requires some more compile time dispatching. This is enabled in the ref impl but - // hasn't been ported here yet. - detail::scalar_extent extents; - using policy_t = typename host_scalar::container_policy_type; - policy_t policy; - auto scalar = host_scalar{extents, policy}; - scalar(0) = v; - return scalar; -} - -/** - * @brief Create a device scalar from v. - * - * @tparam ElementType the data type of the scalar element - * @param[in] v scalar type to wrap on device - * @param[in] stream the cuda stream for ordering events - * @return raft::device_scalar - */ -template -auto make_device_scalar(ElementType const& v, rmm::cuda_stream_view stream) -{ - detail::scalar_extent extents; - using policy_t = typename device_scalar::container_policy_type; - policy_t policy{stream}; - auto scalar = device_scalar{extents, policy}; - scalar(0) = v; - return scalar; -} - -/** - * @brief Create a device scalar from v. 
- * - * @tparam ElementType the data type of the scalar element - * @param[in] handle raft handle for managing expensive cuda resources - * @param[in] v scalar to wrap on device - * @return raft::device_scalar - */ -template -auto make_device_scalar(raft::handle_t const& handle, ElementType const& v) -{ - return make_device_scalar(v, handle.get_stream()); -} - -/** - * @brief Create a 1-dim host mdarray. - * @tparam ElementType the data type of the vector elements - * @param[in] n number of elements in vector - * @return raft::host_vector - */ -template -auto make_host_vector(size_t n) -{ - detail::vector_extent extents{n}; - using policy_t = typename host_vector::container_policy_type; - policy_t policy; - return host_vector{extents, policy}; -} - -/** - * @brief Create a 1-dim device mdarray. - * @tparam ElementType the data type of the vector elements - * @param[in] n number of elements in vector - * @param[in] stream the cuda stream for ordering events - * @return raft::device_vector - */ -template -auto make_device_vector(size_t n, rmm::cuda_stream_view stream) -{ - detail::vector_extent extents{n}; - using policy_t = typename device_vector::container_policy_type; - policy_t policy{stream}; - return device_vector{extents, policy}; -} - -/** - * @brief Create a 1-dim device mdarray. - * @tparam ElementType the data type of the vector elements - * @param[in] handle raft handle for managing expensive cuda resources - * @param[in] n number of elements in vector - * @return raft::device_vector - */ -template -auto make_device_vector(raft::handle_t const& handle, size_t n) -{ - return make_device_vector(n, handle.get_stream()); -} -} // namespace raft +#pragma once +#include diff --git a/cpp/include/raft/span.hpp b/cpp/include/raft/span.hpp index b4fbf5b63a..5462f45618 100644 --- a/cpp/include/raft/span.hpp +++ b/cpp/include/raft/span.hpp @@ -13,270 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#pragma once - -#include -#include // size_t -#include // std::byte -#include -#include -#include // __host__ __device__ -#include -#include - -namespace raft { -/** - * @brief The span class defined in ISO C++20. Iterator is defined as plain pointer and - * most of the methods have bound check on debug build. - * - * @code - * rmm::device_uvector uvec(10, rmm::cuda_stream_default); - * auto view = device_span{uvec.data(), uvec.size()}; - * @endcode - */ -template -class span { - public: - using element_type = T; - using value_type = typename std::remove_cv::type; - using size_type = std::size_t; - using difference_type = std::ptrdiff_t; - using pointer = T*; - using const_pointer = T const*; - using reference = T&; - using const_reference = T const&; - - using iterator = pointer; - using const_iterator = const_pointer; - using reverse_iterator = thrust::reverse_iterator; - using const_reverse_iterator = thrust::reverse_iterator; - - /** - * @brief Default constructor that constructs a span with size 0 and nullptr. - */ - constexpr span() noexcept = default; - - /** - * @brief Constructs a span that is a view over the range [first, first + count); - */ - constexpr span(pointer ptr, size_type count) noexcept : storage_{ptr, count} - { - assert(!(Extent != dynamic_extent && count != Extent)); - assert(ptr || count == 0); - } - /** - * @brief Constructs a span that is a view over the range [first, last) - */ - constexpr span(pointer first, pointer last) noexcept - : span{first, static_cast(thrust::distance(first, last))} - { - } - /** - * @brief Constructs a span that is a view over the array arr. - */ - template - constexpr span(element_type (&arr)[N]) noexcept : span{&arr[0], N} - { - } - - /** - * @brief Initialize a span class from another one who's underlying type is convertible - * to element_type. 
- */ - template ::value && - detail::is_allowed_extent_conversion_t::value>> - constexpr span(const span& other) noexcept - : span{other.data(), other.size()} - { - } - - constexpr span(span const& other) noexcept = default; - constexpr span(span&& other) noexcept = default; - - constexpr auto operator=(span const& other) noexcept -> span& = default; - constexpr auto operator=(span&& other) noexcept -> span& = default; - - constexpr auto begin() const noexcept -> iterator { return data(); } - - constexpr auto end() const noexcept -> iterator { return data() + size(); } - - constexpr auto cbegin() const noexcept -> const_iterator { return data(); } - - constexpr auto cend() const noexcept -> const_iterator { return data() + size(); } - - __host__ __device__ constexpr auto rbegin() const noexcept -> reverse_iterator - { - return reverse_iterator{end()}; - } - - __host__ __device__ constexpr auto rend() const noexcept -> reverse_iterator - { - return reverse_iterator{begin()}; - } - - __host__ __device__ constexpr auto crbegin() const noexcept -> const_reverse_iterator - { - return const_reverse_iterator{cend()}; - } - - __host__ __device__ constexpr auto crend() const noexcept -> const_reverse_iterator - { - return const_reverse_iterator{cbegin()}; - } - - // element access - constexpr auto front() const -> reference { return (*this)[0]; } - - constexpr auto back() const -> reference { return (*this)[size() - 1]; } - - template - constexpr auto operator[](Index _idx) const -> reference - { - assert(static_cast(_idx) < size()); - return data()[_idx]; - } - - constexpr auto data() const noexcept -> pointer { return storage_.data(); } - - // Observers - [[nodiscard]] constexpr auto size() const noexcept -> size_type { return storage_.size(); } - [[nodiscard]] constexpr auto size_bytes() const noexcept -> size_type - { - return size() * sizeof(T); - } - - constexpr auto empty() const noexcept { return size() == 0; } - - // Subviews - template - constexpr auto first() 
const -> span - { - assert(Count <= size()); - return {data(), Count}; - } - - constexpr auto first(std::size_t _count) const -> span - { - assert(_count <= size()); - return {data(), _count}; - } - - template - constexpr auto last() const -> span - { - assert(Count <= size()); - return {data() + size() - Count, Count}; - } - - constexpr auto last(std::size_t _count) const -> span - { - assert(_count <= size()); - return subspan(size() - _count, _count); - } - - /*! - * If Count is std::dynamic_extent, r.size() == this->size() - Offset; - * Otherwise r.size() == Count. - */ - template - constexpr auto subspan() const - -> span::value> - { - assert((Count == dynamic_extent) ? (Offset <= size()) : (Offset + Count <= size())); - return {data() + Offset, Count == dynamic_extent ? size() - Offset : Count}; - } - - constexpr auto subspan(size_type _offset, size_type _count = dynamic_extent) const - -> span - { - assert((_count == dynamic_extent) ? (_offset <= size()) : (_offset + _count <= size())); - return {data() + _offset, _count == dynamic_extent ? size() - _offset : _count}; - } - - private: - detail::span_storage storage_; -}; - -/** - * @brief A span class for host pointer. - */ -template -using host_span = span; /** - * @brief A span class for device pointer. + * This file is deprecated and will be removed in release 22.06. + * Please use include/core/span.hpp instead. 
*/ -template -using device_span = span; -template -constexpr auto operator==(span l, span r) -> bool -{ - if (l.size() != r.size()) { return false; } - for (auto l_beg = l.cbegin(), r_beg = r.cbegin(); l_beg != l.cend(); ++l_beg, ++r_beg) { - if (*l_beg != *r_beg) { return false; } - } - return true; -} - -template -constexpr auto operator!=(span l, span r) -{ - return !(l == r); -} - -template -constexpr auto operator<(span l, span r) -{ - return detail::lexicographical_compare< - typename span::iterator, - typename span::iterator, - thrust::less::element_type>>( - l.begin(), l.end(), r.begin(), r.end()); -} - -template -constexpr auto operator<=(span l, span r) -{ - return !(l > r); -} - -template -constexpr auto operator>(span l, span r) -{ - return detail::lexicographical_compare< - typename span::iterator, - typename span::iterator, - thrust::greater::element_type>>( - l.begin(), l.end(), r.begin(), r.end()); -} - -template -constexpr auto operator>=(span l, span r) -{ - return !(l < r); -} - -/** - * @brief Converts a span into a view of its underlying bytes - */ -template -auto as_bytes(span s) noexcept - -> span::value> -{ - return {reinterpret_cast(s.data()), s.size_bytes()}; -} - -/** - * @brief Converts a span into a mutable view of its underlying bytes - */ -template -auto as_writable_bytes(span s) noexcept - -> span::value> -{ - return {reinterpret_cast(s.data()), s.size_bytes()}; -} -} // namespace raft +#pragma once +#include \ No newline at end of file diff --git a/cpp/include/raft/sparse/detail/cusparse_macros.h b/cpp/include/raft/sparse/detail/cusparse_macros.h index 10c7e8836c..e7d81f51aa 100644 --- a/cpp/include/raft/sparse/detail/cusparse_macros.h +++ b/cpp/include/raft/sparse/detail/cusparse_macros.h @@ -13,111 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - -#pragma once - -#include -#include -///@todo: enable this once logging is enabled -//#include - -#define _CUSPARSE_ERR_TO_STR(err) \ - case err: return #err; - -// Notes: -//(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic; -//(2.) to enforce a lower version, -// -//`#define CUDA_ENFORCE_LOWER -// #include ` -// -// (i.e., before including this header) -// -#define CUDA_VER_10_1_UP (CUDART_VERSION >= 10100) - -namespace raft { - -/** - * @brief Exception thrown when a cuSparse error is encountered. - */ -struct cusparse_error : public raft::exception { - explicit cusparse_error(char const* const message) : raft::exception(message) {} - explicit cusparse_error(std::string const& message) : raft::exception(message) {} -}; - -namespace sparse { -namespace detail { - -inline const char* cusparse_error_to_string(cusparseStatus_t err) -{ -#if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 - return cusparseGetErrorString(err); -#else // CUDART_VERSION - switch (err) { - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - default: return "CUSPARSE_STATUS_UNKNOWN"; - }; -#endif // CUDART_VERSION -} - -} // namespace detail -} // namespace sparse -} // namespace raft - -#undef _CUSPARSE_ERR_TO_STR - /** - * @brief Error checking macro for cuSparse runtime API functions. - * - * Invokes a cuSparse runtime API function call, if the call does not return - * CUSPARSE_STATUS_SUCCESS, throws an exception detailing the cuSparse error that occurred + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
*/ -#define RAFT_CUSPARSE_TRY(call) \ - do { \ - cusparseStatus_t const status = (call); \ - if (CUSPARSE_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuSparse error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::sparse::detail::cusparse_error_to_string(status)); \ - throw raft::cusparse_error(msg); \ - } \ - } while (0) - -// FIXME: Remove after consumer rename -#ifndef CUSPARSE_TRY -#define CUSPARSE_TRY(call) RAFT_CUSPARSE_TRY(call) -#endif -// FIXME: Remove after consumer rename -#ifndef CUSPARSE_CHECK -#define CUSPARSE_CHECK(call) CUSPARSE_TRY(call) -#endif - -//@todo: use logger here once logging is enabled -/** check for cusparse runtime API errors but do not assert */ -#define RAFT_CUSPARSE_TRY_NO_THROW(call) \ - do { \ - cusparseStatus_t err = call; \ - if (err != CUSPARSE_STATUS_SUCCESS) { \ - printf("CUSPARSE call='%s' got errorcode=%d err=%s", \ - #call, \ - err, \ - raft::sparse::detail::cusparse_error_to_string(err)); \ - } \ - } while (0) +#pragma once -// FIXME: Remove after consumer rename -#ifndef CUSPARSE_CHECK_NO_THROW -#define CUSPARSE_CHECK_NO_THROW(call) RAFT_CUSPARSE_TRY_NO_THROW(call) -#endif +#include \ No newline at end of file diff --git a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh index 2d2fabd9d6..d157a57f52 100644 --- a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh @@ -172,4 +172,4 @@ inline void select_k(key_t* inK, }; // namespace detail }; // namespace knn }; // namespace spatial -}; // namespace raft +}; // namespace raft \ No newline at end of file diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index c03e5d6bcd..354b5e8fc4 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -164,7 +164,6 @@ target_link_libraries(test_raft raft::raft raft::distance raft::nn - faiss::faiss GTest::gtest 
GTest::gtest_main Threads::Threads diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu index 70b83fad35..d028da122e 100644 --- a/cpp/test/spatial/fused_l2_knn.cu +++ b/cpp/test/spatial/fused_l2_knn.cu @@ -25,6 +25,10 @@ #include #include +#if defined RAFT_NN_COMPILED +#include +#endif + #include #include diff --git a/docs/source/cuda_cpp.rst b/docs/source/cuda_cpp.rst index 30e8903f29..7a7cdae086 100644 --- a/docs/source/cuda_cpp.rst +++ b/docs/source/cuda_cpp.rst @@ -1,7 +1,7 @@ CUDA/C++ API ============ -RAFT is header-only but provides optional shared libraries to speed up compile times for larger projects. +RAFT is a header-only C++ library with optional pre-compiled shared libraries that can speed up compile times for larger projects. .. _api: diff --git a/docs/source/index.rst b/docs/source/index.rst index d047543c13..97c616dd8e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,7 +1,7 @@ Welcome to RAFT's documentation! ================================= -RAFT contains fundamental widely-used algorithms and primitives for data science, graph and machine learning. +RAFT contains fundamental widely-used algorithms and primitives for data science and machine learning. .. toctree:: :maxdepth: 2